llama_cpp 0.2.0 → 0.2.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/llama_cpp.cpp +52 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +697 -130
- data/ext/llama_cpp/src/ggml-cuda.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +548 -497
- data/ext/llama_cpp/src/ggml-metal.metal +425 -122
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -32
- data/ext/llama_cpp/src/ggml-opencl.h +1 -2
- data/ext/llama_cpp/src/ggml.c +1904 -303
- data/ext/llama_cpp/src/ggml.h +126 -2
- data/ext/llama_cpp/src/llama.cpp +212 -108
- data/ext/llama_cpp/src/llama.h +12 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml-cuda.cu
@@ -1,5 +1,6 @@
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <stdint.h>
 #include <stdio.h>
 #include <atomic>
@@ -24,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         } \
     } while (0)

-#if CUDART_VERSION >=
+#if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
     do { \
         cublasStatus_t err_ = (err); \
@@ -48,6 +49,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
@@ -151,7 +153,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_CPY_BLOCK_SIZE 32
+#define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
@@ -655,10 +660,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
     const int tid = threadIdx.x;

     const int iter_stride = 2*GGML_CUDA_DMMV_X;
@@ -703,8 +713,13 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
 }

 template <int n_thread, dot_kernel_k_t dot_kernel>
-static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols) {
-    const int row = blockIdx.
+static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
     const int tid = threadIdx.x;

     const int iter_stride = QK_K;
@@ -737,6 +752,139 @@ static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y
     }
 }

+static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+    const half * x = (half *) vx;
+
+    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+
+    const int nrows_y = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst = row_x;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
+        const int col_x = col_x0 + threadIdx.x;
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        // x is transposed and permuted
+        const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
+        const float xi = __half2float(x[ix]);
+
+        const int row_y = col_x;
+
+
+        // y is not transposed but permuted
+        const int iy = channel*nrows_y + row_y;
+
+        tmp += xi * y[iy];
+    }
+
+    // dst is not transposed and not permuted
+    const int idst = channel*nrows_dst + row_dst;
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
+
+    const half * x = (half *) vx;
+
+    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+
+    const int nrows_y = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst = row_x;
+
+    const int idst = channel*nrows_dst + row_dst;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
+        const int col_x = col_x0 + threadIdx.x;
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
+        const float xi = __half2float(x[ix]);
+
+        const int row_y = col_x;
+
+        const int iy = channel*nrows_y + row_y;
+
+        tmp += xi * y[iy];
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+    const float * xi = (float *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+    const float * xi = (float *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = __float2half(*xi);
+}
+
+template <cpy_kernel_t cpy_1>
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= ne) {
+        return;
+    }
+
+    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = i - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = i - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+// rope == RoPE == rotary positional embedding
 static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
     const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);

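Both new f16 matrix-vector kernels above end with the same warp-level butterfly reduction: each of the 32 lanes repeatedly adds the partial sum of the lane whose index differs in one bit, so after five __shfl_xor_sync steps lane 0 holds the full dot product. A minimal stand-alone sketch of that pattern; the kernel name and test data below are illustrative, not part of this diff:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Illustrative kernel: one warp reduces 32 per-lane values to a single sum
    // using the same __shfl_xor_sync butterfly as mul_mat_p021_f16_f32 above.
    __global__ void warp_reduce_demo(const float * x, float * out) {
        float tmp = x[threadIdx.x];                             // one partial value per lane
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);  // exchange with lane ^ mask
        }
        if (threadIdx.x == 0) {
            *out = tmp;                                         // lane 0 now holds the warp total
        }
    }

    int main() {
        float h_x[32], h_out = 0.0f;
        for (int i = 0; i < 32; ++i) h_x[i] = (float) i;        // expected sum: 0 + 1 + ... + 31 = 496
        float *d_x, *d_out;
        cudaMalloc(&d_x, sizeof(h_x));
        cudaMalloc(&d_out, sizeof(float));
        cudaMemcpy(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice);
        warp_reduce_demo<<<1, 32>>>(d_x, d_out);
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        printf("warp sum = %.1f\n", h_out);                     // prints 496.0
        cudaFree(d_x);
        cudaFree(d_out);
        return 0;
    }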
@@ -758,6 +906,72 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int i = row*ncols + col;
+    // dst[i] = col > n_past + row ? -INFINITY : x[i];
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+}
+
+// the CUDA soft max implementation differs from the CPU implementation
+// instead of doubles floats are used
+// values are also not normalized to the maximum value by subtracting it in the exponential function
+// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
+static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int block_size = blockDim.x;
+    const int tid = threadIdx.x;
+
+    float tmp = 0.0;
+
+    for (int block_start = 0; block_start < ncols; block_start += block_size) {
+        const int col = block_start + tid;
+
+        if (col >= ncols) {
+            break;
+        }
+
+        const int i = row*ncols + col;
+        const float val = expf(x[i]);
+        tmp += val;
+        dst[i] = val;
+    }
+
+    // sum up partial sums
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    for (int block_start = 0; block_start < ncols; block_start += block_size) {
+        const int col = block_start + tid;
+
+        if (col >= ncols) {
+            break;
+        }
+
+        const int i = row*ncols + col;
+        dst[i] /= tmp;
+    }
+}
+
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = scale * x[i];
+}
+
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
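The comment block above soft_max_f32 notes that the kernel works in single precision and skips the usual normalization by the row maximum. For comparison, a max-subtracted row softmax looks like the sketch below (plain C++ written for this note, not copied from ggml's CPU code); subtracting the maximum keeps every exponent at or below zero, so the exponential cannot overflow:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Reference row-wise softmax in the numerically safe form that the kernel
    // comment refers to; soft_max_f32 above omits the max subtraction and relies
    // on the logits staying small enough for expf(x[i]) not to overflow in float.
    void soft_max_rows_ref(std::vector<float> & x, int nrows, int ncols) {
        for (int r = 0; r < nrows; ++r) {
            float * row = x.data() + (size_t) r * ncols;
            const float max_val = *std::max_element(row, row + ncols);
            float sum = 0.0f;
            for (int c = 0; c < ncols; ++c) {
                row[c] = std::exp(row[c] - max_val);   // exponent is always <= 0
                sum += row[c];
            }
            for (int c = 0; c < ncols; ++c) {
                row[c] /= sum;
            }
        }
    }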
@@ -831,73 +1045,92 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu

 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<
+    dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
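All of the launchers above now compute block_num_y with the same ceiling division instead of assuming that nrows is a multiple of the block height, and the `if (row >= nrows) return;` guard added to the kernels discards the overshoot in the last block. A tiny worked example (the block height of 2 is chosen only for illustration):

    #include <cstdio>

    // Ceiling division as used for block_num_y in the launchers above.
    static int ceil_div(int n, int block) {
        return (n + block - 1) / block;
    }

    int main() {
        printf("%d\n", ceil_div(4096, 2)); // 2048 blocks, no padding rows
        printf("%d\n", ceil_div(4097, 2)); // 2049 blocks; the one padding row is
                                           // skipped by the kernel's bounds check
        return 0;
    }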
@@ -907,10 +1140,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c

 static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
@@ -942,6 +1176,47 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }

+static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
+    const dim3 block_nums(1, nrows_x, nchannels_x);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+}
+
+static void ggml_mul_mat_vec_nc_f16_f32_cuda(
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
+    const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+
+    const dim3 block_nums(1, nrows_x, nchannels_x);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
+}
+
+static void ggml_cpy_f32_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
+}
+
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 2 == 0);
     const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
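The cpy_f32_f16 kernel driven by these two copy helpers recovers three coordinates from one flattened element index and only then applies the per-dimension byte strides, which is what lets it copy between differently laid-out views. The same index arithmetic on the host, with made-up extents, as a quick sanity check:

    #include <cassert>

    int main() {
        // illustrative extents, using the ne00/ne01 naming of cpy_f32_f16
        const int ne00 = 5, ne01 = 3;
        const int i   = 22;                             // flattened element index
        const int i02 = i / (ne00*ne01);                // 22 / 15      = 1
        const int i01 = (i - i02*ne01*ne00) / ne00;     // 7 / 5        = 1
        const int i00 = i - i02*ne01*ne00 - i01*ne00;   // 22 - 15 - 5  = 2
        assert(i02 == 1 && i01 == 1 && i00 == 2);
        assert(i == (i02*ne01 + i01)*ne00 + i00);       // round trip
        return 0;
    }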
@@ -950,6 +1225,19 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }

+static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, nrows_x, 1);
+    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
+}
+
+static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(1, nrows_x, 1);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256

@@ -1105,6 +1393,9 @@ void * ggml_cuda_host_malloc(size_t size) {
     void * ptr = nullptr;
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
+        // The allocation error can be bypassed. A null ptr will assigned out of this function.
+        // This can fixed the OOM error in WSL.
+        cudaGetLastError();
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
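The added cudaGetLastError() call clears the sticky error left behind by a failed cudaMallocHost, so pinned memory can be treated as an optional optimization: when the function returns nullptr the caller can simply fall back to pageable memory. A hedged caller-side sketch; the helper below is illustrative and not part of this diff, it only assumes the ggml_cuda_host_malloc/ggml_cuda_host_free declarations from ggml-cuda.h:

    #include <cstdlib>
    #include "ggml-cuda.h"   // ggml_cuda_host_malloc / ggml_cuda_host_free

    // Prefer pinned (page-locked) memory for faster host<->device transfers,
    // but degrade gracefully to malloc when pinning fails (e.g. under WSL).
    static void * host_buffer_alloc(size_t size, bool * is_pinned) {
        void * ptr = ggml_cuda_host_malloc(size);
        *is_pinned = ptr != nullptr;
        return *is_pinned ? ptr : malloc(size);
    }

    static void host_buffer_free(void * ptr, bool is_pinned) {
        if (is_pinned) {
            ggml_cuda_host_free(ptr);
        } else {
            free(ptr);
        }
    }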
@@ -1117,10 +1408,25 @@ void ggml_cuda_host_free(void * ptr) {
     CUDA_CHECK(cudaFreeHost(ptr));
 }

-static cudaError_t
+static cudaError_t ggml_cuda_cpy_tensor_2d(
     void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {

-
+    cudaMemcpyKind kind;
+    char * src_ptr;
+    if (src->backend == GGML_BACKEND_CPU) {
+        kind = cudaMemcpyHostToDevice;
+        src_ptr = (char *) src->data;
+    } else if (src->backend == GGML_BACKEND_GPU) {
+        kind = cudaMemcpyDeviceToDevice;
+        struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        int id;
+        CUDA_CHECK(cudaGetDevice(&id));
+        src_ptr = (char *) extra->data_device[id];
+    } else {
+        GGML_ASSERT(false);
+    }
+    char * dst_ptr = (char *) dst;
+
     const int64_t ne0 = src->ne[0];
     const int64_t nb0 = src->nb[0];
     const int64_t nb1 = src->nb[1];
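ggml_cuda_cpy_tensor_2d now chooses the memcpy kind from the source backend and, as before, uses cudaMemcpy2DAsync when rows are internally contiguous but padded between row starts. The call shape in isolation, with made-up sizes (a sketch, not code from the diff):

    #include <cuda_runtime.h>

    // Copy `height` rows of `width` bytes each from a padded source layout
    // (srcPitch bytes between row starts) into a tightly packed destination,
    // mirroring the cudaMemcpy2DAsync branch of ggml_cuda_cpy_tensor_2d.
    static cudaError_t copy_padded_rows(void * dst, const void * src,
                                        size_t width, size_t srcPitch, size_t height,
                                        cudaMemcpyKind kind, cudaStream_t stream) {
        // dstPitch == width, so destination rows end up back to back
        return cudaMemcpy2DAsync(dst, width, src, srcPitch, width, height, kind, stream);
    }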
@@ -1131,17 +1437,17 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(
     const int64_t bs = ggml_blck_size(type);
     int64_t i1_diff = i1_high - i1_low;

-    const
+    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
-        return cudaMemcpyAsync(
+        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
     } else if (nb0 == ts) {
-        return cudaMemcpy2DAsync(
+        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
     } else {
         for (int64_t i1 = 0; i1 < i1_diff; i1++) {
             const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
             // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0,
+            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
             if (r != cudaSuccess) return r;
         }
         return cudaSuccess;
@@ -1377,8 +1683,81 @@ inline void ggml_cuda_op_rope(
     (void) i1;
 }

+inline void ggml_cuda_op_diag_mask_inf(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    const int n_past = ((int32_t *) src1->data)[0];
+
+    // compute
+    diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
+inline void ggml_cuda_op_soft_max(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
+inline void ggml_cuda_op_scale(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const float scale = ((float *) src1->data)[0];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    ggml_cuda_op_t op, bool src0_needs_f32) {
+    ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1401,21 +1780,27 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);

     // strides for iteration over dims 3 and 2
-    const int64_t
-    const int64_t
-    const int64_t
-    const int64_t
+    const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
+    const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
+    const int64_t src0_stride = ne00 * ne01 * stride_mod;
+    const int64_t src1_stride = ne10 * ne11 * stride_mod;
+    const int64_t dst_stride = ne0 * ne1 * stride_mod;

     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);

-    struct ggml_tensor_extra_gpu * src0_extra =
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    struct ggml_tensor_extra_gpu * dst_extra
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src0_is_f32 = src0->type == GGML_TYPE_F32;

+    const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
+    const bool src1_stays_on_host = use_src1 && (
+        dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
+
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;

     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
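With flatten_rows set, ggml_cuda_op collapses dimensions 2 and 3 into the row dimension: it runs a single iteration whose strides are scaled by ne02*ne03, so element-wise operations launch one kernel over the whole tensor instead of one per (i02, i03) slice. The stride arithmetic with illustrative extents:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // illustrative extents ne00 x ne01 x ne02 x ne03
        const int64_t ne00 = 4096, ne01 = 32, ne02 = 8, ne03 = 2;
        for (int flatten_rows = 0; flatten_rows <= 1; ++flatten_rows) {
            const int64_t num_iters  = flatten_rows ? 1 : ne02 * ne03;
            const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
            const int64_t src0_stride = ne00 * ne01 * stride_mod;
            // the element count covered is identical either way: 2097152
            printf("flatten_rows=%d: %lld iterations x stride %lld = %lld elements\n",
                   flatten_rows, (long long) num_iters, (long long) src0_stride,
                   (long long) (num_iters * src0_stride));
        }
        return 0;
    }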
@@ -1424,13 +1809,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
     float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
     float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
-    float *
+    float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};

     // asq = actual size quantized, asf = actual size float
     size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
     size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t
+    size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};

     for (int id = 0; id < g_device_count; ++id) {
         if (!split && id != g_main_device) {
@@ -1443,9 +1828,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_low, row_high;
         if (split) {
             row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-            row_low -= row_low % GGML_CUDA_DMMV_Y;
             row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_DMMV_Y;
         } else {
             row_low = 0;
             row_high = nrows0;
@@ -1458,7 +1841,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

         cudaSetDevice(id);

-        if (src0_on_device) {
+        if (src0_on_device && src0_is_contiguous) {
             if (src0_is_f32) {
                 src0_ddf[id] = (float *) src0_extra->data_device[id];
             } else {
@@ -1476,8 +1859,8 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
         }

-        if (use_src1) {
-            if (src1_on_device) {
+        if (use_src1 && !src1_stays_on_host) {
+            if (src1_on_device && src1_is_contiguous) {
                 src1_ddf[id] = (float *) src1_extra->data_device[id];
             } else {
                 src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
@@ -1490,26 +1873,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
         }

-
+        const int64_t i03_max = flatten_rows ? 1 : ne03;
+        const int64_t i02_max = flatten_rows ? 1 : ne02;
+        const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+
+        for (int64_t i03 = 0; i03 < i03_max; i03++) {
             const int64_t i13 = i03 % ne13;
-            for (int64_t i02 = 0; i02 <
+            for (int64_t i02 = 0; i02 < i02_max; i02++) {
                 const int64_t i12 = i02 % ne12;

                 const int64_t i0 = i03*ne02 + i02;
-
-
+
+                // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
+                const int64_t i0_offset_low = row_low/rows_per_iter;
+                const int64_t i0_offset_high = row_high/rows_per_iter;

                 int64_t i01_low = 0;
-                int64_t i01_high =
+                int64_t i01_high = rows_per_iter;
                 if (split) {
                     if (i0 < i0_offset_low || i0 > i0_offset_high) {
                         continue;
                     }
                     if (i0 == i0_offset_low) {
-                        i01_low = row_low %
+                        i01_low = row_low % rows_per_iter;
                     }
                     if (i0 == i0_offset_high) {
-                        i01_high = row_high %
+                        i01_high = row_high % rows_per_iter;
                     }
                 }

@@ -1518,7 +1907,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
                 // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
                 GGML_ASSERT(i01_low == 0 || g_device_count > 1);
-                GGML_ASSERT(i01_high ==
+                GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);

                 const int64_t i01_diff = i01_high - i01_low;
                 if (i01_diff == 0) {
@@ -1526,24 +1915,23 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
                 const int64_t i11 = i13*ne12 + i12;

-                cudaStream_t cudaStream_main
+                cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
                 cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
-                cudaEvent_t cudaEvent_memcpy_src1
+                cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];

                 // for split tensors the data begins at i0 == i0_offset_low
                 char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
                 float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
                 float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
-                float * dst_ddf_i
+                float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

                 // for split tensors the data pointer needs to be rounded down
                 // to the bin edge for i03, i02 bins beyond the first
                 if (i0 - i0_offset_low > 0) {
+                    GGML_ASSERT(!flatten_rows);
                     src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
                     src0_ddf_i -= (row_low % ne01)*ne00;
-
-                if (i0 - i0_offset_low > 0) {
-                    dst_ddf_i -= (row_low % ne0)*ne1;
+                    dst_ddf_i -= (row_low % ne0)*ne1;
                 }

                 // the main device memory buffer can be on VRAM scratch, with space for all partial results
@@ -1553,30 +1941,37 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }

                 // copy src0, src1 to device if necessary
-                if (use_src1) {
+                if (use_src1 && !src1_stays_on_host) {
                     if (src1->backend == GGML_BACKEND_CPU) {
-
-
+                        GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
+                        int64_t nrows1 = flatten_rows ? nrows0 : ne11;
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
+                    } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                         if (id != g_main_device) {
+                            GGML_ASSERT(!flatten_rows);
                             float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                             src1_ddf_i_source += i11*src1_stride;
                             CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
                                 cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
                         }
+                    } else if (src1_on_device && !src1_is_contiguous) {
+                        GGML_ASSERT(!split);
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
                     } else {
                         GGML_ASSERT(false);
                     }
                 }
                 CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
-
+
+                if (!src0_on_device || !src0_is_contiguous) {
                     if (src0_is_f32) {
-                        CUDA_CHECK(
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
                     } else {
-                        CUDA_CHECK(
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
                     }
                 }

-                // convert src0 to f32 if it
+                // convert src0 to f32 if it is necessary for the ggml_cuda_op
                 if (src0_needs_f32 && !src0_is_f32) {
                     to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
                     CUDA_CHECK(cudaGetLastError());
@@ -1641,39 +2036,30 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
 }

 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }

 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }

 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
 }

 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
     const int64_t ne10 = src1->ne[0];

     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];

-    // if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) {
-    //     fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n",
-    //         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-    //         src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
-    //         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
-    //     return false;
-    // }
-
     // TODO: find the optimal values for these
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
@@ -1685,23 +2071,158 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
     return false;
 }

+void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
+    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    CUDA_CHECK(cudaSetDevice(g_main_device));
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+    GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
+    GGML_ASSERT(!ggml_is_permuted(src0));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    CUDA_CHECK(cudaSetDevice(g_main_device));
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    const int row_stride_x = nb01 / sizeof(half);
+    const int channel_stride_x = nb02 / sizeof(half);
+
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-
+    bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
+
+    if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
+    } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
+        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    }else if (src0->type == GGML_TYPE_F32) {
+        ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
         } else {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
     } else {
        GGML_ASSERT(false);
     }
 }

+void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+}
+
+void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne == ggml_nelements(src1));
+
+    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    GGML_ASSERT(src0->ne[3] == 1);
+
+    const int64_t nb00 = src0->nb[0];
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(src1->ne[3] == 1);
+
+    const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+
+    CUDA_CHECK(cudaSetDevice(g_main_device));
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+
+    const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+
+    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
+
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+            ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+            ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    (void) dst;
+}
+
+void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+}
+
+void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+}
+
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
 }

 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
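Read together with the two wrappers above it, the rewritten ggml_cuda_mul_mat tries the specialized f16 vector kernels first and only then falls back to the generic paths. A condensed restatement of the branch order (a sketch with stand-in predicates, not the exact function; the real code asserts on unsupported src0 types instead of defaulting to cuBLAS):

    enum class MulMatPath { P021, VEC_NC, CUBLAS, DMMV };

    // Stand-in booleans mirror the ggml_is_permuted / ggml_is_contiguous checks above.
    static MulMatPath choose_mul_mat_path(
            bool all_on_device, bool src0_f32, bool src0_quant_or_f16,
            bool src0_permuted, bool src1_permuted, bool src0_contig, bool src1_contig,
            bool src1_is_single_column, bool dmmv_sizes_divisible) {
        if (all_on_device && src0_permuted && src1_permuted && src1_is_single_column) {
            return MulMatPath::P021;    // ggml_cuda_mul_mat_vec_p021
        }
        if (all_on_device && !src0_contig && src1_contig && src1_is_single_column) {
            return MulMatPath::VEC_NC;  // ggml_cuda_mul_mat_vec_nc
        }
        if (src0_f32) {
            return MulMatPath::CUBLAS;  // ggml_cuda_op_mul_mat_cublas
        }
        if (src0_quant_or_f16 && src1_is_single_column && dmmv_sizes_divisible) {
            return MulMatPath::DMMV;    // ggml_cuda_op_dequantize_mul_mat_vec
        }
        return MulMatPath::CUBLAS;      // quantized/f16 matrices with wider src1
    }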
@@ -1710,16 +2231,14 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }

-void
-    FILE * fp = fopen(fname, "rb");
+void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
     const size_t nb1 = tensor->nb[1];
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    memset(extra, 0, sizeof(*extra));

     for (int id = 0; id < g_device_count; ++id) {
-        extra->data_device[id] = nullptr;
-
         if (backend == GGML_BACKEND_GPU && id != g_main_device) {
             continue;
         }
@@ -1732,10 +2251,7 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
             row_high = nrows;
         } else if (backend == GGML_BACKEND_GPU_SPLIT) {
             row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-            row_low -= row_low % GGML_CUDA_DMMV_Y;
             row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_DMMV_Y;
-            GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
         } else {
             GGML_ASSERT(false);
         }
@@ -1745,35 +2261,19 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const

         int64_t nrows_split = row_high - row_low;

-        const size_t offset_split =
+        const size_t offset_split = row_low*nb1;
         const size_t size = ggml_nbytes_split(tensor, nrows_split);

         void * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        void * buf_host =
-
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
-#else
-        int ret = fseek(fp, (long) offset_split, SEEK_SET);
-#endif
-        GGML_ASSERT(ret == 0); // same
-
-        size_t ret2 = fread(buf_host, size, 1, fp);
-        if (ret2 != 1) {
-            fprintf(stderr, "unexpectedly reached end of file");
-            exit(1);
-        }
+        void * buf_host = (char*)data + offset_split;

         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
-        cudaDeviceSynchronize();

-        free(buf_host);
         extra->data_device[id] = buf;
     }

     tensor->extra = extra;
-    fclose(fp);
 }

 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
@@ -1795,47 +2295,78 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }

-void
-    if (
-
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+    if (scratch && g_scratch_size == 0) {
+        return;
     }

-
-
-
-
+    // recursively assign CUDA buffers until a compute tensor is found
+    if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src0->op;
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+        }
+    }
+    if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
     }

     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;

-    bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data
+    const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+    const size_t size = ggml_nbytes(tensor);

     CUDA_CHECK(cudaSetDevice(g_main_device));
     if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
-
-
-
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + offset;
+    } else if (tensor->op == GGML_OP_CPY) {
+        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
+        void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra->data_device[g_main_device] = src1_ddv;
+    } else if (scratch) {
+        GGML_ASSERT(size <= g_scratch_size);
+        if (g_scratch_offset + size > g_scratch_size) {
+            g_scratch_offset = 0;
+        }
+
         char * data = (char *) g_scratch_buffer;
         if (data == nullptr) {
             CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
             g_scratch_buffer = data;
         }
         extra->data_device[g_main_device] = data + g_scratch_offset;
-    }

-
-
-
-
+        g_scratch_offset += size;
+
+        GGML_ASSERT(g_scratch_offset <= g_scratch_size);
+    } else { // allocate new buffers outside of scratch
+        void * data;
+        CUDA_CHECK(cudaMalloc(&data, size));
+        CUDA_CHECK(cudaMemset(data, 0, size));
+        extra->data_device[g_main_device] = data;
+    }

-    GGML_ASSERT(g_scratch_offset <= g_scratch_size);
     tensor->extra = extra;
 }

+void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true);
+}
+
+void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false);
+}
+
 void ggml_cuda_set_main_device(int main_device) {
-    if (main_device
+    if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
             main_device, g_device_count, g_main_device);
         return;
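ggml_cuda_assign_buffers_impl hands out sub-ranges of a single lazily allocated scratch buffer and rewinds the offset to zero whenever the next request would run past the end; that is safe because scratch tensors are short-lived intermediates of one graph evaluation. The allocation pattern in isolation, as a minimal sketch with no CUDA calls (not the ggml API):

    #include <cassert>
    #include <cstddef>

    // Minimal bump allocator with wrap-around, mirroring the scratch branch of
    // ggml_cuda_assign_buffers_impl above (illustrative only).
    struct ScratchPool {
        char * base     = nullptr;
        size_t capacity = 0;
        size_t offset   = 0;

        char * assign(size_t size) {
            assert(size <= capacity);
            if (offset + size > capacity) {
                offset = 0;            // rewind instead of growing the buffer
            }
            char * ptr = base + offset;
            offset += size;
            return ptr;
        }
    };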
@@ -1852,6 +2383,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) {
     g_scratch_size = scratch_size;
 }

+void ggml_cuda_free_scratch() {
+    if (g_scratch_buffer == nullptr) {
+        return;
+    }
+
+    CUDA_CHECK(cudaFree(g_scratch_buffer));
+    g_scratch_buffer = nullptr;
+}
+
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -1889,12 +2429,39 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul_mat;
             break;
+        case GGML_OP_SCALE:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_scale;
+            break;
+        case GGML_OP_CPY:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_cpy;
+            break;
         case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
            if (!any_on_device) {
                 return false;
             }
             func = ggml_cuda_nop;
             break;
+        case GGML_OP_DIAG_MASK_INF:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_diag_mask_inf;
+            break;
+        case GGML_OP_SOFT_MAX:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_soft_max;
+            break;
         case GGML_OP_ROPE:
             if (!any_on_device) {
                 return false;