llama_cpp 0.6.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +622 -150
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +358 -131
- data/ext/llama_cpp/src/ggml-metal.metal +137 -47
- data/ext/llama_cpp/src/ggml-opencl.cpp +136 -68
- data/ext/llama_cpp/src/ggml.c +812 -365
- data/ext/llama_cpp/src/ggml.h +25 -7
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +2387 -421
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +5 -2
data/ext/llama_cpp/src/ggml-cuda.cu:

@@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -80,9 +81,9 @@
 #include "ggml.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define
+#define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
-#define CC_RDNA2 CC_OFFSET_AMD + 1030
+#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
@@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -715,7 +718,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 
 //================================== k-quants
 
-
+template<typename dst_t>
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
 const int i = blockIdx.x;
 const block_q2_K * x = (const block_q2_K *) vx;
@@ -727,7 +731,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
 const int is = 8*n + l/16;
 
 const uint8_t q = x[i].qs[32*n + l];
-
+dst_t * y = yy + i*QK_K + 128*n;
 
 float dall = __low2half(x[i].dm);
 float dmin = __high2half(x[i].dm);
@@ -739,7 +743,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
 const int is = tid/16; // 0 or 1
 const int il = tid%16; // 0...15
 const uint8_t q = x[i].qs[il] >> (2*is);
-
+dst_t * y = yy + i*QK_K + 16*is + il;
 float dall = __low2half(x[i].dm);
 float dmin = __high2half(x[i].dm);
 y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
@@ -748,7 +752,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
 
 }
 
-
+template<typename dst_t>
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
 const int i = blockIdx.x;
 const block_q3_K * x = (const block_q3_K *) vx;
@@ -772,7 +777,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
 float d_all = x[i].d;
 float dl = d_all * (us - 32);
 
-
+dst_t * y = yy + i*QK_K + 128*n + 32*j;
 const uint8_t * q = x[i].qs + 32*n;
 const uint8_t * hm = x[i].hmask;
 
@@ -784,7 +789,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
 const int im = il/8; // 0...1
 const int in = il%8; // 0...7
 
-
+dst_t * y = yy + i*QK_K + 16*is + il;
 
 const uint8_t q = x[i].qs[il] >> (2*is);
 const uint8_t h = x[i].hmask[in] >> (2*is + im);
@@ -812,7 +817,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif
 
-
+template<typename dst_t>
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 const block_q4_K * x = (const block_q4_K *) vx;
 
 const int i = blockIdx.x;
@@ -825,7 +831,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 const int is = 2*il;
 const int n = 4;
 
-
+dst_t * y = yy + i*QK_K + 64*il + n*ir;
 
 const float dall = __low2half(x[i].dm);
 const float dmin = __high2half(x[i].dm);
@@ -844,7 +850,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 #else
 const int tid = threadIdx.x;
 const uint8_t * q = x[i].qs;
-
+dst_t * y = yy + i*QK_K;
 const float d = (float)x[i].dm[0];
 const float m = (float)x[i].dm[1];
 y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
@@ -852,7 +858,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 #endif
 }
 
-
+template<typename dst_t>
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 const block_q5_K * x = (const block_q5_K *) vx;
 
 const int i = blockIdx.x;
@@ -864,7 +871,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 const int ir = tid%16; // ir is in 0...15
 const int is = 2*il; // is is in 0...6
 
-
+dst_t * y = yy + i*QK_K + 64*il + 2*ir;
 
 const float dall = __low2half(x[i].dm);
 const float dmin = __high2half(x[i].dm);
@@ -892,13 +899,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 const int is = tid/16; // 0 or 1
 const uint8_t h = x[i].qh[in] >> im;
 const float d = x[i].d;
-
+dst_t * y = yy + i*QK_K + tid;
 y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
 y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
 #endif
 }
 
-
+template<typename dst_t>
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 const block_q6_K * x = (const block_q6_K *) vx;
 
 const int i = blockIdx.x;
@@ -910,7 +918,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
 const int il = tid - 32*ip; // 0...32
 const int is = 8*ip + il/16;
 
-
+dst_t * y = yy + i*QK_K + 128*ip + il;
 
 const float d = x[i].d;
 
@@ -929,7 +937,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
 const int ip = tid/16; // 0 or 1
 const int il = tid - 16*ip; // 0...15
 
-
+dst_t * y = yy + i*QK_K + 16*ip + il;
 
 const float d = x[i].d;
 
@@ -1569,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+if (col >= ncols) {
+return;
+}
+
+const int r = y[row];
+
+// copy x[r*ncols + col] to dst[row*ncols + col]
+const int xi = r*ncols + col;
+const int di = row*ncols + col;
+
+const int ib = xi/qk; // block index
+const int iqs = (xi%qk)/qr; // quant index
+const int iybs = di - di%qk; // y block start index
+const int y_offset = qr == 1 ? 1 : qk/2;
+
+// dequantize
+dfloat2 v;
+dequantize_kernel(x, ib, iqs, v);
+
+dst[iybs + iqs + 0] = v.x;
+dst[iybs + iqs + y_offset] = v.y;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
 const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
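Note: the new k_get_rows kernel above reads one row index per output row from an int32 tensor and dequantizes two values per thread. The standalone sketch below (hypothetical values, not part of the gem) mirrors only its index arithmetic, assuming a Q4_0-style layout with qk = 32 and qr = 2.

    // Illustration of the k_get_rows index math (assumed qk/qr/ncols values).
    #include <cstdio>

    int main() {
        const int qk = 32;     // elements per quantized block (assumption)
        const int qr = 2;      // each stored quant expands to two output values
        const int ncols = 128; // row length (assumption)

        const int r = 3;       // source row selected by the int32 index tensor
        const int row = 0;     // destination row
        const int col = 50;    // even output column handled by this thread

        const int xi = r*ncols + col;   // flat read position in the source
        const int di = row*ncols + col; // flat write position in the destination

        const int ib   = xi/qk;         // quantized block holding the value
        const int iqs  = (xi%qk)/qr;    // quant slot inside that block
        const int iybs = di - di%qk;    // start of the destination block
        const int y_offset = qr == 1 ? 1 : qk/2;

        printf("read block %d slot %d, write %d and %d\n", ib, iqs, iybs + iqs, iybs + iqs + y_offset);
        return 0;
    }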
@@ -3548,7 +3584,7 @@ template <bool need_check> static __global__ void
 load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q4_0_AMPERE;
 const int mmq_y = MMQ_Y_Q4_0_AMPERE;
 const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3568,7 +3604,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q4_0_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q4_1_RDNA2 64
@@ -3589,9 +3625,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q4_1(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3611,7 +3647,7 @@ template <bool need_check> static __global__ void
 load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q4_1_AMPERE;
 const int mmq_y = MMQ_Y_Q4_1_AMPERE;
 const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3631,7 +3667,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q4_1_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q5_0_RDNA2 64
@@ -3672,7 +3708,7 @@ template <bool need_check> static __global__ void
 load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q5_0_AMPERE;
 const int mmq_y = MMQ_Y_Q5_0_AMPERE;
 const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3692,7 +3728,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q5_0_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q5_1_RDNA2 64
@@ -3733,7 +3769,7 @@ mul_mat_q5_1(
 load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q5_1_AMPERE;
 const int mmq_y = MMQ_Y_Q5_1_AMPERE;
 const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3753,7 +3789,7 @@ mul_mat_q5_1(
 #else
 (void) vec_dot_q5_1_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q8_0_RDNA2 64
@@ -3794,7 +3830,7 @@ template <bool need_check> static __global__ void
 load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q8_0_AMPERE;
 const int mmq_y = MMQ_Y_Q8_0_AMPERE;
 const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3814,7 +3850,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q8_0_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q2_K_RDNA2 64
@@ -3855,7 +3891,7 @@ mul_mat_q2_K(
 load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q2_K_AMPERE;
 const int mmq_y = MMQ_Y_Q2_K_AMPERE;
 const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3875,7 +3911,7 @@ mul_mat_q2_K(
 #else
 (void) vec_dot_q2_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q3_K_RDNA2 128
@@ -3896,9 +3932,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
 __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
 __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q3_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3918,7 +3954,7 @@ template <bool need_check> static __global__ void
 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q3_K_AMPERE;
 const int mmq_y = MMQ_Y_Q3_K_AMPERE;
 const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3938,7 +3974,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q3_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q4_K_RDNA2 64
@@ -3959,9 +3995,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q4_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3981,7 +4017,7 @@ template <bool need_check> static __global__ void
 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q4_K_AMPERE;
 const int mmq_y = MMQ_Y_Q4_K_AMPERE;
 const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -4001,7 +4037,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q4_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q5_K_RDNA2 64
@@ -4042,7 +4078,7 @@ mul_mat_q5_K(
 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q5_K_AMPERE;
 const int mmq_y = MMQ_Y_Q5_K_AMPERE;
 const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4062,7 +4098,7 @@ mul_mat_q5_K(
 #else
 (void) vec_dot_q5_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define MMQ_X_Q6_K_RDNA2 64
@@ -4083,9 +4119,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
 __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
 __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q6_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4105,7 +4141,7 @@ template <bool need_check> static __global__ void
 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 
-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q6_K_AMPERE;
 const int mmq_y = MMQ_Y_Q6_K_AMPERE;
 const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4125,7 +4161,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q6_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -4550,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
 dst[i] = scale * x[i];
 }
 
+static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+if (i >= k) {
+return;
+}
+
+dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+const dim3 block_nums(block_num_x, nrows, 1);
+k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+}
+
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
 const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
 add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
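Note: the new clamp_f32 kernel and get_rows_cuda launcher above follow the usual one-thread-per-element pattern with a rounded-up 1D grid. The short sketch below (hypothetical element count, not from the gem) shows the grid arithmetic and a CPU reference for the clamp expression.

    // Host-side sketch of the block/grid sizing used by clamp_f32_cuda.
    #include <algorithm>
    #include <cstdio>

    static float clamp_ref(float x, float lo, float hi) {
        return std::min(std::max(x, lo), hi);  // same result as the ternary in clamp_f32
    }

    int main() {
        const int k = 1000;           // number of elements (assumption)
        const int block_size = 256;   // matches CUDA_CLAMP_BLOCK_SIZE in the diff
        const int num_blocks = (k + block_size - 1) / block_size;  // round up

        printf("launch %d blocks of %d threads for %d elements\n", num_blocks, block_size, k);
        printf("clamp_ref(3.5, -1, 1) = %.1f\n", clamp_ref(3.5f, -1.0f, 1.0f));
        return 0;
    }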
@@ -4604,32 +4658,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
 quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 #if QK_K == 256
 dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4638,7 +4698,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 #if QK_K == 256
 dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4647,12 +4708,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 #if QK_K == 256
 dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4661,7 +4724,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }
 
-
+template<typename dst_t>
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 #if QK_K == 256
 dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4868,6 +4932,26 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
 
 static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
 switch (type) {
+case GGML_TYPE_Q4_0:
+return dequantize_row_q4_0_cuda;
+case GGML_TYPE_Q4_1:
+return dequantize_row_q4_1_cuda;
+case GGML_TYPE_Q5_0:
+return dequantize_row_q5_0_cuda;
+case GGML_TYPE_Q5_1:
+return dequantize_row_q5_1_cuda;
+case GGML_TYPE_Q8_0:
+return dequantize_row_q8_0_cuda;
+case GGML_TYPE_Q2_K:
+return dequantize_row_q2_K_cuda;
+case GGML_TYPE_Q3_K:
+return dequantize_row_q3_K_cuda;
+case GGML_TYPE_Q4_K:
+return dequantize_row_q4_K_cuda;
+case GGML_TYPE_Q5_K:
+return dequantize_row_q5_K_cuda;
+case GGML_TYPE_Q6_K:
+return dequantize_row_q6_K_cuda;
 case GGML_TYPE_F32:
 return convert_fp32_to_fp16_cuda;
 default:
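Note: by templating the dequantize_row_*_cuda helpers on the output type, the ggml_get_to_fp16_cuda switch above can hand back an fp16-producing conversion routine for every quantized type. The simplified sketch below only mirrors that dispatch pattern; the enum, function names and signatures are hypothetical stand-ins, not the gem's API.

    // Type-to-converter dispatch, sketched with toy names.
    #include <cstdio>

    enum class toy_type { f32, q4_0, q8_0 };
    using to_fp16_fn = void (*)(const void * src, void * dst, int n);

    static void conv_f32 (const void *, void *, int n) { printf("convert %d f32 values\n", n); }
    static void conv_q4_0(const void *, void *, int n) { printf("dequantize %d q4_0 values\n", n); }
    static void conv_q8_0(const void *, void *, int n) { printf("dequantize %d q8_0 values\n", n); }

    static to_fp16_fn get_to_fp16(toy_type t) {
        switch (t) {
            case toy_type::f32:  return conv_f32;
            case toy_type::q4_0: return conv_q4_0;
            case toy_type::q8_0: return conv_q8_0;
        }
        return nullptr;  // unsupported type: caller asserts, as the CUDA code does
    }

    int main() {
        to_fp16_fn fn = get_to_fp16(toy_type::q4_0);
        if (fn) fn(nullptr, nullptr, 32);
        return 0;
    }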
@@ -4921,7 +5005,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
 mmq_x = MMQ_X_Q4_0_RDNA1;
 mmq_y = MMQ_Y_Q4_0_RDNA1;
 nwarps = NWARPS_Q4_0_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q4_0_AMPERE;
 mmq_y = MMQ_Y_Q4_0_AMPERE;
 nwarps = NWARPS_Q4_0_AMPERE;
@@ -4966,7 +5050,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
 mmq_x = MMQ_X_Q4_1_RDNA1;
 mmq_y = MMQ_Y_Q4_1_RDNA1;
 nwarps = NWARPS_Q4_1_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q4_1_AMPERE;
 mmq_y = MMQ_Y_Q4_1_AMPERE;
 nwarps = NWARPS_Q4_1_AMPERE;
@@ -5011,7 +5095,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
 mmq_x = MMQ_X_Q5_0_RDNA1;
 mmq_y = MMQ_Y_Q5_0_RDNA1;
 nwarps = NWARPS_Q5_0_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q5_0_AMPERE;
 mmq_y = MMQ_Y_Q5_0_AMPERE;
 nwarps = NWARPS_Q5_0_AMPERE;
@@ -5056,7 +5140,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
 mmq_x = MMQ_X_Q5_1_RDNA1;
 mmq_y = MMQ_Y_Q5_1_RDNA1;
 nwarps = NWARPS_Q5_1_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q5_1_AMPERE;
 mmq_y = MMQ_Y_Q5_1_AMPERE;
 nwarps = NWARPS_Q5_1_AMPERE;
@@ -5101,7 +5185,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
 mmq_x = MMQ_X_Q8_0_RDNA1;
 mmq_y = MMQ_Y_Q8_0_RDNA1;
 nwarps = NWARPS_Q8_0_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q8_0_AMPERE;
 mmq_y = MMQ_Y_Q8_0_AMPERE;
 nwarps = NWARPS_Q8_0_AMPERE;
@@ -5146,7 +5230,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
 mmq_x = MMQ_X_Q2_K_RDNA1;
 mmq_y = MMQ_Y_Q2_K_RDNA1;
 nwarps = NWARPS_Q2_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q2_K_AMPERE;
 mmq_y = MMQ_Y_Q2_K_AMPERE;
 nwarps = NWARPS_Q2_K_AMPERE;
@@ -5193,7 +5277,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
 mmq_x = MMQ_X_Q3_K_RDNA1;
 mmq_y = MMQ_Y_Q3_K_RDNA1;
 nwarps = NWARPS_Q3_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q3_K_AMPERE;
 mmq_y = MMQ_Y_Q3_K_AMPERE;
 nwarps = NWARPS_Q3_K_AMPERE;
@@ -5239,7 +5323,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
 mmq_x = MMQ_X_Q4_K_RDNA1;
 mmq_y = MMQ_Y_Q4_K_RDNA1;
 nwarps = NWARPS_Q4_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q4_K_AMPERE;
 mmq_y = MMQ_Y_Q4_K_AMPERE;
 nwarps = NWARPS_Q4_K_AMPERE;
@@ -5284,7 +5368,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
 mmq_x = MMQ_X_Q5_K_RDNA1;
 mmq_y = MMQ_Y_Q5_K_RDNA1;
 nwarps = NWARPS_Q5_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q5_K_AMPERE;
 mmq_y = MMQ_Y_Q5_K_AMPERE;
 nwarps = NWARPS_Q5_K_AMPERE;
@@ -5329,7 +5413,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
 mmq_x = MMQ_X_Q6_K_RDNA1;
 mmq_y = MMQ_Y_Q6_K_RDNA1;
 nwarps = NWARPS_Q6_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q6_K_AMPERE;
 mmq_y = MMQ_Y_Q6_K_AMPERE;
 nwarps = NWARPS_Q6_K_AMPERE;
@@ -5401,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
+static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+}
+
 template<typename T>
 static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
 const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5668,7 +5757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
 } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
 GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
 kind = cudaMemcpyDeviceToDevice;
-
+ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
 src_ptr = (char *) extra->data_device[id];
@@ -5704,6 +5793,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
 }
 }
 
+static void ggml_cuda_op_repeat(
+const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+// guaranteed to be an integer due to the check in ggml_can_repeat
+const int64_t ne0 = dst->ne[0];
+const int64_t ne1 = dst->ne[1];
+const int64_t ne2 = dst->ne[2];
+const int64_t ne3 = dst->ne[3];
+
+const int64_t ne00 = src0->ne[0];
+const int64_t ne01 = src0->ne[1];
+const int64_t ne02 = src0->ne[2];
+const int64_t ne03 = src0->ne[3];
+
+const size_t nb0 = dst->nb[0];
+const size_t nb1 = dst->nb[1];
+const size_t nb2 = dst->nb[2];
+const size_t nb3 = dst->nb[3];
+
+const size_t nb00 = src0->nb[0];
+const size_t nb01 = src0->nb[1];
+const size_t nb02 = src0->nb[2];
+const size_t nb03 = src0->nb[3];
+
+const int nr0 = (int)(ne0/ne00);
+const int nr1 = (int)(ne1/ne01);
+const int nr2 = (int)(ne2/ne02);
+const int nr3 = (int)(ne3/ne03);
+
+// TODO: support for transposed / permuted tensors
+GGML_ASSERT(nb0 == sizeof(float));
+GGML_ASSERT(nb00 == sizeof(float));
+
+// TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+for (int i3 = 0; i3 < nr3; i3++) {
+for (int k3 = 0; k3 < ne03; k3++) {
+for (int i2 = 0; i2 < nr2; i2++) {
+for (int k2 = 0; k2 < ne02; k2++) {
+for (int i1 = 0; i1 < nr1; i1++) {
+for (int k1 = 0; k1 < ne01; k1++) {
+for (int i0 = 0; i0 < nr0; i0++) {
+CUDA_CHECK(cudaMemcpyAsync(
+(char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
+(const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+}
+}
+}
+}
+}
+}
+}
+
+(void) src1;
+(void) src1_d;
+}
+
+static void ggml_cuda_op_get_rows(
+const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+GGML_ASSERT(src1->type == GGML_TYPE_I32);
+GGML_ASSERT(dst->type == GGML_TYPE_F32);
+GGML_ASSERT(ggml_is_contiguous(src0));
+GGML_ASSERT(ggml_is_contiguous(src1));
+GGML_ASSERT(ggml_is_contiguous(dst));
+
+const int ncols = src0->ne[0];
+const int nrows = ggml_nelements(src1);
+
+const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+switch (src0->type) {
+case GGML_TYPE_F16:
+get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+break;
+case GGML_TYPE_F32:
+get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+break;
+case GGML_TYPE_Q4_0:
+get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+break;
+case GGML_TYPE_Q4_1:
+get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+break;
+case GGML_TYPE_Q5_0:
+get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+break;
+case GGML_TYPE_Q5_1:
+get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+break;
+case GGML_TYPE_Q8_0:
+get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+break;
+default:
+// TODO: k-quants
+GGML_ASSERT(false);
+break;
+}
+}
+
 inline void ggml_cuda_op_add(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
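Note: the ggml_cuda_op_repeat fallback above broadcasts by issuing one cudaMemcpyAsync per repeated source row (its own TODO comment flags this as inefficient). The small sketch below, with hypothetical shapes, just counts how many row copies the nested loops produce.

    // Copy count of the kernel-less repeat fallback (assumed shapes).
    #include <cstdio>

    int main() {
        // src0 is 4x3x2x1; dst repeats it 2x along dim 1 and 3x along dim 2.
        const long ne00 = 4, ne01 = 3, ne02 = 2, ne03 = 1;  // source extents
        const long nr0 = 1, nr1 = 2, nr2 = 3, nr3 = 1;      // repeat factors per dim

        // One async copy of ne00 floats per (i3,k3,i2,k2,i1,k1,i0) loop combination.
        const long copies = nr3*ne03 * nr2*ne02 * nr1*ne01 * nr0;
        printf("%ld async row copies of %ld floats each\n", copies, ne00);
        return 0;
    }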
@@ -5907,7 +6097,7 @@ static int64_t get_row_rounding(ggml_type type) {
 switch(type) {
 case GGML_TYPE_Q4_0:
 case GGML_TYPE_Q4_1:
-return max_compute_capability >=
+return max_compute_capability >= CC_VOLTA ? 128 : 64;
 case GGML_TYPE_Q5_0:
 case GGML_TYPE_Q5_1:
 case GGML_TYPE_Q8_0:
@@ -5918,7 +6108,7 @@ static int64_t get_row_rounding(ggml_type type) {
 case GGML_TYPE_Q3_K:
 case GGML_TYPE_Q4_K:
 case GGML_TYPE_Q5_K:
-return max_compute_capability >=
+return max_compute_capability >= CC_VOLTA ? 128 : 64;
 case GGML_TYPE_Q6_K:
 return 64;
 default:
@@ -6083,8 +6273,19 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
 const int compute_capability = g_compute_capabilities[id];
 
-if (compute_capability >=
-// convert src1 to fp16, multiply as fp16, convert dst to fp32
+if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+half * src0_as_f16 = nullptr;
+size_t src0_as = 0;
+if (src0->type != GGML_TYPE_F16) {
+const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
+GGML_ASSERT(to_fp16_cuda != nullptr);
+size_t ne = row_diff*ne00;
+src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
+to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
+}
+const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
+
 half * src1_as_f16 = nullptr;
 size_t src1_as = 0;
 if (src1->type != GGML_TYPE_F16) {
@@ -6106,9 +6307,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
 CUBLAS_CHECK(
 cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
 row_diff, src1_ncols, ne10,
-&alpha_f16,
-src1_ptr,
-&beta_f16, dst_f16,
+&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
+src1_ptr, CUDA_R_16F, ne10,
+&beta_f16, dst_f16, CUDA_R_16F, ldc,
 CUBLAS_COMPUTE_16F,
 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
@@ -6117,6 +6318,10 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
 ggml_cuda_pool_free(dst_f16, dst_as);
 
+if (src0_as != 0) {
+ggml_cuda_pool_free(src0_as_f16, src0_as);
+}
+
 if (src1_as != 0) {
 ggml_cuda_pool_free(src1_as_f16, src1_as);
 }
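Note: the hunks above route quantized src0 through the new fp16 conversion table and then call cublasGemmEx with every operand declared as CUDA_R_16F and CUBLAS_COMPUTE_16F. The sketch below (assumes CUDA 11+ and cuBLAS; tiny made-up matrices, minimal error handling) shows the same call shape in isolation.

    // Minimal fp16 cublasGemmEx example, not taken from the gem.
    #include <cstdio>
    #include <cublas_v2.h>
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>

    int main() {
        const int m = 2, n = 2, k = 2;
        const __half alpha = __float2half(1.0f), beta = __float2half(0.0f);

        __half ha[4], hb[4], hc[4];
        for (int i = 0; i < 4; ++i) { ha[i] = __float2half(float(i + 1)); hb[i] = __float2half(1.0f); }

        __half *da, *db, *dc;
        cudaMalloc(&da, sizeof(ha)); cudaMalloc(&db, sizeof(hb)); cudaMalloc(&dc, sizeof(hc));
        cudaMemcpy(da, ha, sizeof(ha), cudaMemcpyHostToDevice);
        cudaMemcpy(db, hb, sizeof(hb), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);

        // C = alpha*A*B + beta*C, column-major, fp16 operands and fp16 accumulation
        cublasStatus_t st = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                                         &alpha, da, CUDA_R_16F, m,
                                                 db, CUDA_R_16F, k,
                                         &beta,  dc, CUDA_R_16F, m,
                                         CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

        cudaMemcpy(hc, dc, sizeof(hc), cudaMemcpyDeviceToHost);
        printf("status %d, c[0][0] = %.1f\n", (int) st, __half2float(hc[0]));

        cublasDestroy(handle);
        cudaFree(da); cudaFree(db); cudaFree(dc);
        return 0;
    }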
@@ -6229,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
 const int64_t ne02 = src0->ne[2];
 const int64_t nrows = ggml_nrows(src0);
 
-const int n_past = ((int32_t *) dst->op_params)[0];
+//const int n_past = ((int32_t *) dst->op_params)[0];
 const int n_head = ((int32_t *) dst->op_params)[1];
 float max_bias;
 memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-GGML_ASSERT(ne01 + n_past == ne00);
+//GGML_ASSERT(ne01 + n_past == ne00);
 GGML_ASSERT(n_head == ne02);
 
 const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6293,7 +6498,14 @@ inline void ggml_cuda_op_scale(
 GGML_ASSERT(src1->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
+float scale;
+// HACK: support for ggml backend interface
+if (src1->backend == GGML_BACKEND_CPU) {
+scale = ((float *) src1->data)[0];
+} else {
+// TODO: pass pointer to kernel instead of copying to host
+CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
+}
 
 scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
 CUDA_CHECK(cudaGetLastError());
@@ -6303,6 +6515,24 @@ inline void ggml_cuda_op_scale(
 (void) src1_dd;
 }
 
+inline void ggml_cuda_op_clamp(
+const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+GGML_ASSERT(src0->type == GGML_TYPE_F32);
+GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+const float min = ((float *) dst->op_params)[0];
+const float max = ((float *) dst->op_params)[1];
+
+clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+CUDA_CHECK(cudaGetLastError());
+
+(void) src1;
+(void) dst;
+(void) src1_dd;
+}
+
 static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
 const int64_t nrows0 = ggml_nrows(src0);
 
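Note: ggml_cuda_op_clamp above reads its min/max bounds out of the tensor's raw int32 op_params array by reinterpreting the storage as floats. The tiny sketch below shows that pack/unpack convention with a hypothetical local buffer, using memcpy for the well-defined equivalent of the cast in the diff.

    // Packing two floats into an int32 parameter array and reading them back.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        int32_t op_params[4] = {0};          // stand-in for a tensor's op_params
        const float min_v = -1.0f, max_v = 1.0f;
        memcpy(&op_params[0], &min_v, sizeof(float));   // producer packs the floats
        memcpy(&op_params[1], &max_v, sizeof(float));

        float min_out, max_out;                          // consumer unpacks them
        memcpy(&min_out, &op_params[0], sizeof(float));
        memcpy(&max_out, &op_params[1], sizeof(float));
        printf("clamp to [%.1f, %.1f]\n", min_out, max_out);
        return 0;
    }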
@@ -6312,9 +6542,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
|
6312
6542
|
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
6313
6543
|
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
|
6314
6544
|
|
6315
|
-
|
6316
|
-
|
6317
|
-
|
6545
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6546
|
+
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
6547
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6318
6548
|
|
6319
6549
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6320
6550
|
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
@@ -6455,9 +6685,9 @@ static void ggml_cuda_op_mul_mat(
|
|
6455
6685
|
const size_t q8_1_ts = sizeof(block_q8_1);
|
6456
6686
|
const size_t q8_1_bs = QK8_1;
|
6457
6687
|
|
6458
|
-
|
6459
|
-
|
6460
|
-
|
6688
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6689
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6690
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6461
6691
|
|
6462
6692
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6463
6693
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
@@ -6535,7 +6765,7 @@ static void ggml_cuda_op_mul_mat(
|
|
6535
6765
|
if (convert_src1_to_q8_1) {
|
6536
6766
|
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
|
6537
6767
|
|
6538
|
-
if (
|
6768
|
+
if (src1_on_device && src1_is_contiguous) {
|
6539
6769
|
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
6540
6770
|
CUDA_CHECK(cudaGetLastError());
|
6541
6771
|
}
|
@@ -6617,7 +6847,7 @@ static void ggml_cuda_op_mul_mat(
|
|
6617
6847
|
GGML_ASSERT(false);
|
6618
6848
|
}
|
6619
6849
|
|
6620
|
-
if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
|
6850
|
+
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
|
6621
6851
|
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
6622
6852
|
CUDA_CHECK(cudaGetLastError());
|
6623
6853
|
}
|
@@ -6708,6 +6938,14 @@ static void ggml_cuda_op_mul_mat(
|
|
6708
6938
|
}
|
6709
6939
|
}
|
6710
6940
|
|
6941
|
+
static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6942
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
|
6943
|
+
}
|
6944
|
+
|
6945
|
+
static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6946
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
|
6947
|
+
}
|
6948
|
+
|
6711
6949
|
static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6712
6950
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
|
6713
6951
|
}
|
@@ -6762,13 +7000,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
|
|
6762
7000
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6763
7001
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6764
7002
|
|
6765
|
-
|
7003
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6766
7004
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
6767
7005
|
|
6768
|
-
|
7006
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6769
7007
|
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
6770
7008
|
|
6771
|
-
|
7009
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6772
7010
|
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6773
7011
|
|
6774
7012
|
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
@@ -6793,13 +7031,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
6793
7031
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6794
7032
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6795
7033
|
|
6796
|
-
|
7034
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6797
7035
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
6798
7036
|
|
6799
|
-
|
7037
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6800
7038
|
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
6801
7039
|
|
6802
|
-
|
7040
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6803
7041
|
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6804
7042
|
|
6805
7043
|
const int64_t row_stride_x = nb01 / sizeof(half);
|
@@ -6820,11 +7058,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
6820
7058
|
}
|
6821
7059
|
}
|
6822
7060
|
|
6823
|
-
if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
7061
|
+
if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
6824
7062
|
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
6825
7063
|
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
|
6826
7064
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
6827
|
-
}else if (src0->type == GGML_TYPE_F32) {
|
7065
|
+
} else if (src0->type == GGML_TYPE_F32) {
|
6828
7066
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
6829
7067
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
6830
7068
|
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
@@ -6856,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
|
|
6856
7094
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
|
6857
7095
|
}
|
6858
7096
|
|
7097
|
+
static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7098
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
|
7099
|
+
}
|
7100
|
+
|
6859
7101
|
static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6860
7102
|
const int64_t ne = ggml_nelements(src0);
|
6861
7103
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
@@ -6885,8 +7127,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
6885
7127
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6886
7128
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6887
7129
|
|
6888
|
-
const
|
6889
|
-
const
|
7130
|
+
const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
7131
|
+
const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6890
7132
|
|
6891
7133
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
6892
7134
|
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
@@ -6941,8 +7183,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
6941
7183
|
|
6942
7184
|
const size_t nb1 = tensor->nb[1];
|
6943
7185
|
|
6944
|
-
|
6945
|
-
|
7186
|
+
ggml_backend_type backend = tensor->backend;
|
7187
|
+
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
6946
7188
|
memset(extra, 0, sizeof(*extra));
|
6947
7189
|
|
6948
7190
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
@@ -6996,7 +7238,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
6996
7238
|
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
6997
7239
|
}
|
6998
7240
|
|
6999
|
-
|
7000
7241
|
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
|
7001
7242
|
|
7002
7243
|
extra->data_device[id] = buf;
|
@@ -7035,17 +7276,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
|
7035
7276
|
delete extra;
|
7036
7277
|
}
|
7037
7278
|
|
7038
|
-
static
|
7279
|
+
static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
7039
7280
|
static size_t g_temp_tensor_extra_index = 0;
|
7040
7281
|
|
7041
|
-
static
|
7282
|
+
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
7042
7283
|
if (g_temp_tensor_extras == nullptr) {
|
7043
7284
|
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
|
7044
7285
|
}
|
7045
7286
|
|
7046
7287
|
size_t alloc_index = g_temp_tensor_extra_index;
|
7047
7288
|
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
|
7048
|
-
|
7289
|
+
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
7049
7290
|
memset(extra, 0, sizeof(*extra));
|
7050
7291
|
|
7051
7292
|
return extra;
|
@@ -7073,7 +7314,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
|
7073
7314
|
return;
|
7074
7315
|
}
|
7075
7316
|
|
7076
|
-
|
7317
|
+
ggml_tensor_extra_gpu * extra;
|
7077
7318
|
|
7078
7319
|
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
7079
7320
|
tensor->op == GGML_OP_VIEW ||
|
@@ -7082,7 +7323,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
|
7082
7323
|
|
7083
7324
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7084
7325
|
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
7085
|
-
|
7326
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
7086
7327
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
7087
7328
|
size_t offset = 0;
|
7088
7329
|
if (tensor->op == GGML_OP_VIEW) {
|
@@ -7091,7 +7332,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
|
7091
7332
|
extra = ggml_cuda_alloc_temp_tensor_extra();
|
7092
7333
|
extra->data_device[g_main_device] = src0_ddc + offset;
|
7093
7334
|
} else if (tensor->op == GGML_OP_CPY) {
|
7094
|
-
|
7335
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
7095
7336
|
void * src1_ddv = src1_extra->data_device[g_main_device];
|
7096
7337
|
extra = ggml_cuda_alloc_temp_tensor_extra();
|
7097
7338
|
extra->data_device[g_main_device] = src1_ddv;
|
@@ -7133,13 +7374,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
|
|
7133
7374
|
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
7134
7375
|
}
|
7135
7376
|
|
7136
|
-
|
7377
|
+
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
7137
7378
|
|
7138
7379
|
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
7139
7380
|
tensor->op == GGML_OP_VIEW;
|
7140
7381
|
|
7141
7382
|
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
7142
|
-
|
7383
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
7143
7384
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
7144
7385
|
size_t view_offset = 0;
|
7145
7386
|
if (tensor->op == GGML_OP_VIEW) {
|
@@ -7157,7 +7398,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
|
7157
7398
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
7158
7399
|
GGML_ASSERT(ggml_is_contiguous(tensor));
|
7159
7400
|
|
7160
|
-
|
7401
|
+
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
7161
7402
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7162
7403
|
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
7163
7404
|
}
|
@@ -7214,58 +7455,47 @@ void ggml_cuda_free_scratch() {
     g_scratch_buffer = nullptr;
 }

-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+        return false;
+    }
+
     switch (tensor->op) {
+        case GGML_OP_REPEAT:
+            func = ggml_cuda_repeat;
+            break;
+        case GGML_OP_GET_ROWS:
+            func = ggml_cuda_get_rows;
+            break;
         case GGML_OP_DUP:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_dup;
             break;
         case GGML_OP_ADD:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_add;
             break;
         case GGML_OP_MUL:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_mul;
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
-                    if (!any_on_device) {
-                        return false;
-                    }
                     func = ggml_cuda_gelu;
                     break;
                 case GGML_UNARY_OP_SILU:
-                    if (!any_on_device) {
-                        return false;
-                    }
                     func = ggml_cuda_silu;
                     break;
                 default:
                     return false;
             } break;
         case GGML_OP_NORM:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_norm;
             break;
         case GGML_OP_RMS_NORM:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
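Note (editorial, not part of the diff): this hunk and the one that follows hoist the repeated per-case `if (!any_on_device) { return false; }` guards into a single early return ahead of the switch; only GGML_OP_MUL_MAT is still examined when no operand is resident on the GPU, since its inputs may be split across devices. The sketch below uses hypothetical names and simply illustrates the same guard-hoisting pattern in isolation:

    // Illustrative only: the guard-hoisting refactor applied in the hunk above.
    #include <cstdio>

    enum class Op { Add, Mul, MulMat };

    // before: every case repeated `if (!on_gpu) return false;`
    // after:  one early return covers all ops except MulMat
    static bool dispatch(Op op, bool any_on_device) {
        if (!any_on_device && op != Op::MulMat) {
            return false; // caller falls back to the CPU implementation
        }
        switch (op) {
            case Op::Add:    std::puts("add on GPU");     break;
            case Op::Mul:    std::puts("mul on GPU");     break;
            case Op::MulMat: std::puts("mul_mat on GPU"); break;
        }
        return true;
    }

    int main() {
        std::printf("%d\n", dispatch(Op::Add,    false)); // 0 -> would fall back to CPU
        std::printf("%d\n", dispatch(Op::MulMat, false)); // 1 -> still offloaded
    }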
@@ -7275,54 +7505,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_mul_mat;
             break;
         case GGML_OP_SCALE:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_scale;
             break;
-        case GGML_OP_CPY:
+        case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
             }
+            func = ggml_cuda_clamp;
+            break;
+        case GGML_OP_CPY:
             func = ggml_cuda_cpy;
             break;
         case GGML_OP_CONT:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_dup;
             break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_nop;
             break;
         case GGML_OP_DIAG_MASK_INF:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_diag_mask_inf;
             break;
         case GGML_OP_SOFT_MAX:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_soft_max;
             break;
         case GGML_OP_ROPE:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_rope;
             break;
         case GGML_OP_ALIBI:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_alibi;
             break;
         default:
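Note (editorial, not part of the diff): with GGML_OP_CLAMP now routed to ggml_cuda_clamp (and CUDA_CLAMP_BLOCK_SIZE defined earlier in this file), a clamp node no longer forces a CPU fallback. A hedged sketch of building such a node with the bundled ggml C API follows; the tensor shape and values are illustrative only:

    // Illustrative only: create a graph node that the CUDA backend can now execute
    // via the GGML_OP_CLAMP case added above. Uses the ggml.h API bundled with the gem.
    #include "ggml.h"

    struct ggml_tensor * make_clamped(struct ggml_context * ctx) {
        // 1024-element f32 tensor, clamped into [-1, 1]
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        return ggml_clamp(ctx, x, -1.0f, 1.0f); // produces a GGML_OP_CLAMP node
    }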
@@ -7350,3 +7562,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_backend_context_cuda {
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+    delete cuda_ctx;
+    delete backend;
+}
+
+struct ggml_backend_buffer_context_cuda {
+    void * device;
+
+    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+    size_t temp_tensor_extra_index = 0;
+
+    ~ggml_backend_buffer_context_cuda() {
+        delete[] temp_tensor_extras;
+    }
+
+    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+        if (temp_tensor_extras == nullptr) {
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        }
+
+        size_t alloc_index = temp_tensor_extra_index;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+        memset(extra, 0, sizeof(*extra));
+
+        return extra;
+    }
+};
+
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFree(ctx->device));
+    delete ctx;
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->device;
+}
+
+static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+    }
+
+    return size;
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->backend == buffer->backend);
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+    extra->data_device[g_main_device] = tensor->data;
+
+    tensor->backend = GGML_BACKEND_GPU;
+    tensor->extra = extra;
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+        }
+    }
+
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .free_buffer    = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_cuda_buffer_get_base,
+    /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
+    /* .init_tensor    = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .free_tensor    = */ NULL,
+};
+
+static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+    CUDA_CHECK(cudaMalloc(&ctx->device, size));
+    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+    return 128;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name            = */ ggml_backend_cuda_name,
+    /* .free                = */ ggml_backend_cuda_free,
+    /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_cuda_synchronize,
+    /* .cpy_tensor_from     = */ nullptr,
+    /* .cpy_tensor_to       = */ nullptr,
+    /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op         = */ nullptr,
+};
+
+ggml_backend_t ggml_backend_cuda_init() {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context   = */ ctx
+    };
+
+    return cuda_backend;
+}