llama_cpp 0.5.3 → 0.7.0
This diff shows the changes between publicly released versions of the package as they appear in its public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +583 -262
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +326 -149
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +167 -89
- data/ext/llama_cpp/src/ggml-metal.metal +130 -40
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +2355 -1166
- data/ext/llama_cpp/src/ggml.h +129 -35
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +1766 -671
- data/ext/llama_cpp/src/llama.h +321 -120
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -10
- data/sig/llama_cpp.rbs +70 -34
- metadata +4 -3

data/ext/llama_cpp/src/ggml-cuda.cu

```diff
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -14,9 +15,11 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
@@ -77,9 +80,9 @@
 #include "ggml.h"

 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define
+#define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
-#define CC_RDNA2 CC_OFFSET_AMD + 1030
+#define CC_RDNA2 (CC_OFFSET_AMD + 1030)

 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
@@ -235,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
     return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }

+template<typename T>
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
```
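The hunk above replaces the lone `to_fp32_cuda_t` typedef with an alias template, so the fp32 and fp16 converter function pointers share one shape. Below is a minimal standalone sketch of the same pattern (illustration only, not package code; the toy converter, names, and values are invented):

```cpp
#include <cstdio>

// One alias template stands in for a family of converter signatures,
// mirroring to_t_cuda_t<float>/to_t_cuda_t<half> above (toy version, no CUDA stream).
template <typename T>
using to_t_fn = void (*)(const void * x, T * y, int k);

// Toy "dequantizer": expand bytes in [0, 255] to floats in [0, 1].
static void bytes_to_f32(const void * x, float * y, int k) {
    const unsigned char * q = (const unsigned char *) x;
    for (int i = 0; i < k; ++i) {
        y[i] = q[i] / 255.0f;
    }
}

int main() {
    to_t_fn<float> to_f32 = bytes_to_f32;   // to_t_fn<half> would select a half-precision converter
    const unsigned char q[4] = {0, 85, 170, 255};
    float out[4];
    to_f32(q, out, 4);
    printf("%.3f %.3f %.3f %.3f\n", out[0], out[1], out[2], out[3]);
    return 0;
}
```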
```diff
@@ -461,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static bool g_mul_mat_q = true;

 static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size =
+static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -708,7 +715,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

 //================================== k-quants

-
+template<typename dst_t>
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
@@ -720,7 +728,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int is = 8*n + l/16;

     const uint8_t q = x[i].qs[32*n + l];
-
+    dst_t * y = yy + i*QK_K + 128*n;

     float dall = __low2half(x[i].dm);
     float dmin = __high2half(x[i].dm);
@@ -732,7 +740,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int is = tid/16; // 0 or 1
     const int il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
-
+    dst_t * y = yy + i*QK_K + 16*is + il;
     float dall = __low2half(x[i].dm);
     float dmin = __high2half(x[i].dm);
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
@@ -741,7 +749,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float

 }

-
+template<typename dst_t>
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
@@ -765,7 +774,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
     float d_all = x[i].d;
     float dl = d_all * (us - 32);

-
+    dst_t * y = yy + i*QK_K + 128*n + 32*j;
     const uint8_t * q = x[i].qs + 32*n;
     const uint8_t * hm = x[i].hmask;

@@ -777,7 +786,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
     const int im = il/8; // 0...1
     const int in = il%8; // 0...7

-
+    dst_t * y = yy + i*QK_K + 16*is + il;

     const uint8_t q = x[i].qs[il] >> (2*is);
     const uint8_t h = x[i].hmask[in] >> (2*is + im);
@@ -805,7 +814,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif

-
+template<typename dst_t>
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;

     const int i = blockIdx.x;
@@ -818,7 +828,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int is = 2*il;
     const int n = 4;

-
+    dst_t * y = yy + i*QK_K + 64*il + n*ir;

     const float dall = __low2half(x[i].dm);
     const float dmin = __high2half(x[i].dm);
@@ -837,7 +847,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 #else
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
-
+    dst_t * y = yy + i*QK_K;
     const float d = (float)x[i].dm[0];
     const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
@@ -845,7 +855,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 #endif
 }

-
+template<typename dst_t>
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;

     const int i = blockIdx.x;
@@ -857,7 +868,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
     const int ir = tid%16; // ir is in 0...15
     const int is = 2*il; // is is in 0...6

-
+    dst_t * y = yy + i*QK_K + 64*il + 2*ir;

     const float dall = __low2half(x[i].dm);
     const float dmin = __high2half(x[i].dm);
@@ -885,13 +896,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
     const int is = tid/16; // 0 or 1
     const uint8_t h = x[i].qh[in] >> im;
     const float d = x[i].d;
-
+    dst_t * y = yy + i*QK_K + tid;
     y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
     y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
 #endif
 }

-
+template<typename dst_t>
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;

     const int i = blockIdx.x;
@@ -903,7 +915,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
     const int il = tid - 32*ip; // 0...32
     const int is = 8*ip + il/16;

-
+    dst_t * y = yy + i*QK_K + 128*ip + il;

     const float d = x[i].d;

@@ -922,7 +934,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
     const int ip = tid/16; // 0 or 1
     const int il = tid - 16*ip; // 0...15

-
+    dst_t * y = yy + i*QK_K + 16*ip + il;

     const float d = x[i].d;

@@ -1515,6 +1527,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }

+static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const float * x = (const float *) vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x = x[ib + iqs + 0];
+    v.y = x[ib + iqs + 1];
+}
+
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
     const int ix = blockDim.x*blockIdx.x + threadIdx.x;

```
```diff
@@ -1554,8 +1574,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * __restrict__ vx,
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

     if (i >= k) {
```
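With `dequantize_block` now templated on `dst_t`, the same kernel body can write either `float` or `half` output. A self-contained sketch of that instantiation pattern (a toy kernel with made-up sizes, not the package kernel; error checks omitted):

```cpp
#include <cstdio>
#include <cuda_fp16.h>

// One kernel, templated on the destination type - mirrors dequantize_block<..., dst_t>.
template <typename dst_t>
__global__ void convert_block(const float * x, dst_t * y, int k) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < k) {
        y[i] = x[i];   // implicit float -> dst_t conversion (float or half)
    }
}

int main() {
    const int k = 8;
    float h_x[k];
    for (int i = 0; i < k; ++i) h_x[i] = 0.5f * i;

    float *d_x;  half *d_y16;  float *d_y32;
    cudaMalloc(&d_x,   k * sizeof(float));
    cudaMalloc(&d_y16, k * sizeof(half));
    cudaMalloc(&d_y32, k * sizeof(float));
    cudaMemcpy(d_x, h_x, k * sizeof(float), cudaMemcpyHostToDevice);

    // Same kernel, two instantiations: half output and float output.
    convert_block<half> <<<1, k>>>(d_x, d_y16, k);
    convert_block<float><<<1, k>>>(d_x, d_y32, k);
    cudaDeviceSynchronize();

    half h_y16[k];
    cudaMemcpy(h_y16, d_y16, k * sizeof(half), cudaMemcpyDeviceToHost);
    printf("y16[3] = %f\n", __half2float(h_y16[3]));

    cudaFree(d_x); cudaFree(d_y16); cudaFree(d_y32);
    return 0;
}
```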
```diff
@@ -3533,7 +3553,7 @@ template <bool need_check> static __global__ void
         load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q4_0_AMPERE;
     const int mmq_y = MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3553,7 +3573,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q4_1_RDNA2 64
@@ -3574,9 +3594,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3596,7 +3616,7 @@ template <bool need_check> static __global__ void
         load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q4_1_AMPERE;
     const int mmq_y = MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3616,7 +3636,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q5_0_RDNA2 64
@@ -3657,7 +3677,7 @@ template <bool need_check> static __global__ void
         load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q5_0_AMPERE;
     const int mmq_y = MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3677,7 +3697,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q5_1_RDNA2 64
@@ -3718,7 +3738,7 @@ mul_mat_q5_1(
         load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q5_1_AMPERE;
     const int mmq_y = MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3738,7 +3758,7 @@ mul_mat_q5_1(
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q8_0_RDNA2 64
@@ -3779,7 +3799,7 @@ template <bool need_check> static __global__ void
         load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q8_0_AMPERE;
     const int mmq_y = MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3799,7 +3819,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q2_K_RDNA2 64
@@ -3840,7 +3860,7 @@ mul_mat_q2_K(
         load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q2_K_AMPERE;
     const int mmq_y = MMQ_Y_Q2_K_AMPERE;
     const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3860,7 +3880,7 @@ mul_mat_q2_K(
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q3_K_RDNA2 128
@@ -3881,9 +3901,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3903,7 +3923,7 @@ template <bool need_check> static __global__ void
         load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q3_K_AMPERE;
     const int mmq_y = MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3923,7 +3943,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q4_K_RDNA2 64
@@ -3944,9 +3964,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3966,7 +3986,7 @@ template <bool need_check> static __global__ void
         load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q4_K_AMPERE;
     const int mmq_y = MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3986,7 +4006,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q5_K_RDNA2 64
@@ -4027,7 +4047,7 @@ mul_mat_q5_K(
         load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q5_K_AMPERE;
     const int mmq_y = MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4047,7 +4067,7 @@ mul_mat_q5_K(
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q6_K_RDNA2 64
@@ -4068,9 +4088,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4090,7 +4110,7 @@ template <bool need_check> static __global__ void
         load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q6_K_AMPERE;
     const int mmq_y = MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4110,7 +4130,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -4355,8 +4375,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 }

 // rope == RoPE == rotary positional embedding
-
-
+
+template<typename T, bool has_pos>
+static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+    const int p_delta_rows, const float theta_scale) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
@@ -4365,8 +4387,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c

     const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;

-    const
+    const int p = has_pos ? pos[i2] : 0;
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4377,8 +4402,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

-
-
+template<typename T, bool has_pos>
+static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+    const int p_delta_rows, const float theta_scale) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
@@ -4387,8 +4413,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco

     const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col/2;
+    const int i2 = row/p_delta_rows;

-    const
+    const int p = has_pos ? pos[i2] : 0;
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4399,8 +4428,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float
-    const
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
+    const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;

@@ -4410,11 +4439,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol

     const int row = blockDim.y*blockIdx.y + threadIdx.y;
     const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;

     const float col_theta_scale = powf(theta_scale, col);
-
+    // FIXME: this is likely wrong
+    const int p = pos != nullptr ? pos[i2] : 0;

-    const float theta = min(p,
+    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4424,7 +4455,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0] = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-    const float block_theta = max(p -
+    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);

```
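The templated `rope` kernel above reduces to a per-pair rotation: for each even column index `col`, an angle `theta = p * freq_scale * theta_scale^(col/2)` is computed from the token position `p`, and the pair `(x0, x1)` is rotated by it. A small host-side sketch of that arithmetic (toy sizes and values, no CUDA):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int   ncols       = 8;        // head dimension (toy value)
    const int   p           = 5;        // token position (toy value)
    const float freq_scale  = 1.0f;
    const float freq_base   = 10000.0f;
    const float theta_scale = powf(freq_base, -2.0f / ncols);

    float x[ncols], dst[ncols];
    for (int i = 0; i < ncols; ++i) x[i] = 1.0f;

    // Rotate each (x[col], x[col+1]) pair, as the rope<> kernel does per thread.
    for (int col = 0; col < ncols; col += 2) {
        const float theta     = p * freq_scale * powf(theta_scale, col / 2);
        const float sin_theta = sinf(theta);
        const float cos_theta = cosf(theta);
        const float x0 = x[col + 0];
        const float x1 = x[col + 1];
        dst[col + 0] = x0 * cos_theta - x1 * sin_theta;
        dst[col + 1] = x0 * sin_theta + x1 * cos_theta;
    }
    printf("dst[0]=%.4f dst[1]=%.4f\n", dst[0], dst[1]);
    return 0;
}
```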
```diff
@@ -4578,32 +4609,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }

-
+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4612,7 +4649,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }

-
+template<typename dst_t>
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4621,12 +4659,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }

-
+template<typename dst_t>
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
 }

-
+template<typename dst_t>
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4635,7 +4675,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }

-
+template<typename dst_t>
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4826,6 +4867,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

+static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
```
```diff
@@ -4835,6 +4881,35 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_row_q4_0_cuda;
+        case GGML_TYPE_Q4_1:
+            return dequantize_row_q4_1_cuda;
+        case GGML_TYPE_Q5_0:
+            return dequantize_row_q5_0_cuda;
+        case GGML_TYPE_Q5_1:
+            return dequantize_row_q5_1_cuda;
+        case GGML_TYPE_Q8_0:
+            return dequantize_row_q8_0_cuda;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F32:
+            return convert_fp32_to_fp16_cuda;
+        default:
+            return nullptr;
+    }
+}
+
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
```
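The new `ggml_get_to_fp16_cuda` above maps a tensor type to a converter and returns `nullptr` for unsupported types, which callers assert on before launching. A compact standalone sketch of that dispatch style (hypothetical toy enum and converters, not the ggml API):

```cpp
#include <cassert>
#include <cstdio>

enum toy_type { TOY_F32, TOY_Q8 };          // hypothetical stand-ins for ggml_type

typedef void (*to_fp32_fn)(const void * x, float * y, int k);

static void q8_to_f32(const void * x, float * y, int k) {
    const signed char * q = (const signed char *) x;
    for (int i = 0; i < k; ++i) y[i] = q[i] * (1.0f / 127.0f);   // toy dequantization scale
}
static void f32_copy(const void * x, float * y, int k) {
    const float * f = (const float *) x;
    for (int i = 0; i < k; ++i) y[i] = f[i];
}

// Map a storage format to its converter; nullptr signals "unsupported".
static to_fp32_fn get_to_fp32(toy_type t) {
    switch (t) {
        case TOY_F32: return f32_copy;
        case TOY_Q8:  return q8_to_f32;
        default:      return nullptr;
    }
}

int main() {
    signed char q[3] = {127, 0, -127};
    float out[3];
    to_fp32_fn fn = get_to_fp32(TOY_Q8);
    assert(fn != nullptr);                  // mirrors GGML_ASSERT(to_fp16_cuda != nullptr)
    fn(q, out, 3);
    printf("%.2f %.2f %.2f\n", out[0], out[1], out[2]);
    return 0;
}
```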
```diff
@@ -4881,7 +4956,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
         mmq_x = MMQ_X_Q4_0_RDNA1;
         mmq_y = MMQ_Y_Q4_0_RDNA1;
         nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q4_0_AMPERE;
         mmq_y = MMQ_Y_Q4_0_AMPERE;
         nwarps = NWARPS_Q4_0_AMPERE;
@@ -4926,7 +5001,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
         mmq_x = MMQ_X_Q4_1_RDNA1;
         mmq_y = MMQ_Y_Q4_1_RDNA1;
         nwarps = NWARPS_Q4_1_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q4_1_AMPERE;
         mmq_y = MMQ_Y_Q4_1_AMPERE;
         nwarps = NWARPS_Q4_1_AMPERE;
@@ -4971,7 +5046,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
         mmq_x = MMQ_X_Q5_0_RDNA1;
         mmq_y = MMQ_Y_Q5_0_RDNA1;
         nwarps = NWARPS_Q5_0_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q5_0_AMPERE;
         mmq_y = MMQ_Y_Q5_0_AMPERE;
         nwarps = NWARPS_Q5_0_AMPERE;
@@ -5016,7 +5091,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
         mmq_x = MMQ_X_Q5_1_RDNA1;
         mmq_y = MMQ_Y_Q5_1_RDNA1;
         nwarps = NWARPS_Q5_1_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q5_1_AMPERE;
         mmq_y = MMQ_Y_Q5_1_AMPERE;
         nwarps = NWARPS_Q5_1_AMPERE;
@@ -5061,7 +5136,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
         mmq_x = MMQ_X_Q8_0_RDNA1;
         mmq_y = MMQ_Y_Q8_0_RDNA1;
         nwarps = NWARPS_Q8_0_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q8_0_AMPERE;
         mmq_y = MMQ_Y_Q8_0_AMPERE;
         nwarps = NWARPS_Q8_0_AMPERE;
@@ -5106,7 +5181,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
         mmq_x = MMQ_X_Q2_K_RDNA1;
         mmq_y = MMQ_Y_Q2_K_RDNA1;
         nwarps = NWARPS_Q2_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q2_K_AMPERE;
         mmq_y = MMQ_Y_Q2_K_AMPERE;
         nwarps = NWARPS_Q2_K_AMPERE;
@@ -5153,7 +5228,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mmq_x = MMQ_X_Q3_K_RDNA1;
         mmq_y = MMQ_Y_Q3_K_RDNA1;
         nwarps = NWARPS_Q3_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q3_K_AMPERE;
         mmq_y = MMQ_Y_Q3_K_AMPERE;
         nwarps = NWARPS_Q3_K_AMPERE;
@@ -5199,7 +5274,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
         mmq_x = MMQ_X_Q4_K_RDNA1;
         mmq_y = MMQ_Y_Q4_K_RDNA1;
         nwarps = NWARPS_Q4_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q4_K_AMPERE;
         mmq_y = MMQ_Y_Q4_K_AMPERE;
         nwarps = NWARPS_Q4_K_AMPERE;
@@ -5244,7 +5319,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
         mmq_x = MMQ_X_Q5_K_RDNA1;
         mmq_y = MMQ_Y_Q5_K_RDNA1;
         nwarps = NWARPS_Q5_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q5_K_AMPERE;
         mmq_y = MMQ_Y_Q5_K_AMPERE;
         nwarps = NWARPS_Q5_K_AMPERE;
@@ -5289,7 +5364,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
         mmq_x = MMQ_X_Q6_K_RDNA1;
         mmq_y = MMQ_Y_Q6_K_RDNA1;
         nwarps = NWARPS_Q6_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q6_K_AMPERE;
         mmq_y = MMQ_Y_Q6_K_AMPERE;
         nwarps = NWARPS_Q6_K_AMPERE;
@@ -5361,31 +5436,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }

-
-
+template<typename T>
+static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+    const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
-
+    if (pos == nullptr) {
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
 }

-
-
+template<typename T>
+static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+    const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
-
+    if (pos == nullptr) {
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
 }

-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
-    const
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+    const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
     GGML_ASSERT(ncols % 4 == 0);
     const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
     const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5857,7 +5942,7 @@ static int64_t get_row_rounding(ggml_type type) {
     switch(type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
-            return max_compute_capability >=
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
@@ -5868,7 +5953,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
-            return max_compute_capability >=
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
         default:
@@ -6016,8 +6101,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_dd_i != nullptr);

-    const float alpha = 1.0f;
-    const float beta = 0.0f;

     const int64_t ne00 = src0->ne[0];

@@ -6026,16 +6109,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const int64_t ne0 = dst->ne[0];
     const int64_t row_diff = row_high - row_low;

-    float * src0_ddq_as_f32;
-    size_t src0_as = 0;
-
-    if (src0->type != GGML_TYPE_F32) {
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-        src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
-        to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
-    }
-    const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
-
     int id;
     CUDA_CHECK(cudaGetDevice(&id));

```
```diff
@@ -6043,16 +6116,87 @@ inline void ggml_cuda_op_mul_mat_cublas(
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;

-
-
-
-
-
-
-
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+        half * src0_as_f16 = nullptr;
+        size_t src0_as = 0;
+        if (src0->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = row_diff*ne00;
+            src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
+            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
+        }
+        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
+
+        half * src1_as_f16 = nullptr;
+        size_t src1_as = 0;
+        if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = src1_ncols*ne10;
+            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+        }
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
+
+        size_t dst_as = 0;
+        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+
+        const half alpha_f16 = 1.0f;
+        const half beta_f16 = 0.0f;
+
+        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+        CUBLAS_CHECK(
+            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                row_diff, src1_ncols, ne10,
+                &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
+                            src1_ptr, CUDA_R_16F, ne10,
+                &beta_f16,  dst_f16,  CUDA_R_16F, ldc,
+                CUBLAS_COMPUTE_16F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
+
+        ggml_cuda_pool_free(dst_f16, dst_as);
+
+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_as_f16, src0_as);
+        }

-
-
+        if (src1_as != 0) {
+            ggml_cuda_pool_free(src1_as_f16, src1_as);
+        }
+    }
+    else {
+        float * src0_ddq_as_f32 = nullptr;
+        size_t src0_as = 0;
+
+        if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+        }
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+
+        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+        CUBLAS_CHECK(
+            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                row_diff, src1_ncols, ne10,
+                &alpha, src0_ddf_i, ne00,
+                        src1_ddf_i, ne10,
+                &beta,  dst_dd_i,  ldc));
+
+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+        }
     }

     (void) dst;
```
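The new fast path above runs the matrix product entirely in FP16 with `cublasGemmEx` and only converts the result back to FP32 at the end. Below is a minimal, self-contained sketch of such a call (small made-up matrices, no error handling, none of the package's pooled-buffer setup); it assumes a CUDA 11+ toolkit where `CUBLAS_COMPUTE_16F` is available. Build with `nvcc example.cu -lcublas`:

```cpp
#include <cstdio>
#include <vector>
#include <cuda_fp16.h>
#include <cublas_v2.h>

int main() {
    const int m = 4, n = 4, k = 4;                 // toy sizes
    std::vector<half> h_a(m * k), h_b(k * n), h_c(m * n);
    for (int i = 0; i < m * k; ++i) h_a[i] = __float2half(1.0f);
    for (int i = 0; i < k * n; ++i) h_b[i] = __float2half(1.0f);

    half *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, m * k * sizeof(half));
    cudaMalloc(&d_b, k * n * sizeof(half));
    cudaMalloc(&d_c, m * n * sizeof(half));
    cudaMemcpy(d_a, h_a.data(), m * k * sizeof(half), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b.data(), k * n * sizeof(half), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    // With CUBLAS_COMPUTE_16F, alpha/beta are half, as in the diff above.
    const half alpha = __float2half(1.0f);
    const half beta  = __float2half(0.0f);

    // cuBLAS is column-major; with no transposes this computes C = A * B.
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                 m, n, k,
                 &alpha, d_a, CUDA_R_16F, m,
                         d_b, CUDA_R_16F, k,
                 &beta,  d_c, CUDA_R_16F, m,
                 CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

    cudaMemcpy(h_c.data(), d_c, m * n * sizeof(half), cudaMemcpyDeviceToHost);
    printf("c[0] = %f (expected %d)\n", __half2float(h_c[0]), k);

    cublasDestroy(handle);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
```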
```diff
@@ -6064,14 +6208,16 @@ inline void ggml_cuda_op_rope(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);

     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
+    const int64_t ne2 = dst->ne[2];
     const int64_t nrows = ggml_nrows(src0);

-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
     const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -6082,19 +6228,38 @@ inline void ggml_cuda_op_rope(
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
+
+    const int32_t * pos = nullptr;
+    if ((mode & 1) == 0) {
+        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->ne[0] == ne2);
+        pos = (const int32_t *) src1_dd;
+    }

     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;

     // compute
     if (is_glm) {
-
+        GGML_ASSERT(false);
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
-
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
     } else {
-
+        if (src0->type == GGML_TYPE_F32) {
+            rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
     }

     (void) src1;
@@ -6265,7 +6430,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }

-void ggml_cuda_set_peer_access(const int n_tokens) {
+static void ggml_cuda_set_peer_access(const int n_tokens) {
     static bool peer_access_enabled = false;

     const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
@@ -6593,27 +6758,27 @@ static void ggml_cuda_op_mul_mat(
     }
 }

-void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }

-void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }

-void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }

-void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }

-void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }

-void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }

@@ -6624,17 +6789,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
     const int64_t ne1 = dst->ne[1];

     // TODO: find the optimal values for these
-
-
-
-
-        return true;
-    }
-
-    return false;
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        src1->type == GGML_TYPE_F32 &&
+        dst->type == GGML_TYPE_F32 &&
+        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }

-void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -6663,7 +6824,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }

-void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
@@ -6697,7 +6858,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }

-void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;

@@ -6741,11 +6902,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }
 }

-void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }

-void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));

@@ -6787,35 +6948,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
             ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+            ggml_type_name(src0->type), ggml_type_name(src1->type));
         GGML_ASSERT(false);
     }

     (void) dst;
 }

-void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_cpy(src0, dst, nullptr);
     (void) src1;
 }

-void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
 }

-void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
 }

-void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
 }

-void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }

-void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
     (void) dst;
@@ -6938,11 +7101,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     return extra;
 }

-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
+static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
     if (scratch && g_scratch_size == 0) {
         return;
     }

+    tensor->backend = GGML_BACKEND_GPU;
+
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
@@ -6954,8 +7119,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }

-    tensor->backend = GGML_BACKEND_GPU;
-
     if (scratch && no_alloc) {
         return;
     }
@@ -7040,6 +7203,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
     tensor->extra = extra;
 }

+void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
     ggml_cuda_assign_buffers_impl(tensor, true, false, false);
 }
@@ -7075,7 +7247,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }

 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
-
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
 }

 void ggml_cuda_free_scratch() {
```