llama_cpp 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/llama_cpp.cpp +52 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +697 -130
- data/ext/llama_cpp/src/ggml-cuda.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +548 -497
- data/ext/llama_cpp/src/ggml-metal.metal +425 -122
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -32
- data/ext/llama_cpp/src/ggml-opencl.h +1 -2
- data/ext/llama_cpp/src/ggml.c +1904 -303
- data/ext/llama_cpp/src/ggml.h +126 -2
- data/ext/llama_cpp/src/llama.cpp +212 -108
- data/ext/llama_cpp/src/llama.h +12 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -1,5 +1,6 @@
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <stdint.h>
 #include <stdio.h>
 #include <atomic>
@@ -24,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)
 
-#if CUDART_VERSION >=
+#if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
 do { \
     cublasStatus_t err_ = (err); \
@@ -48,6 +49,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
@@ -151,7 +153,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_CPY_BLOCK_SIZE 32
+#define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
@@ -655,10 +660,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
     const int tid = threadIdx.x;
 
     const int iter_stride = 2*GGML_CUDA_DMMV_X;
@@ -703,8 +713,13 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
 }
 
 template <int n_thread, dot_kernel_k_t dot_kernel>
-static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols) {
-    const int row = blockIdx.
+static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
     const int tid = threadIdx.x;
 
     const int iter_stride = QK_K;
@@ -737,6 +752,139 @@ static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y
     }
 }
 
+static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+    const half * x = (half *) vx;
+
+    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+
+    const int nrows_y = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst = row_x;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
+        const int col_x = col_x0 + threadIdx.x;
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        // x is transposed and permuted
+        const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
+        const float xi = __half2float(x[ix]);
+
+        const int row_y = col_x;
+
+
+        // y is not transposed but permuted
+        const int iy = channel*nrows_y + row_y;
+
+        tmp += xi * y[iy];
+    }
+
+    // dst is not transposed and not permuted
+    const int idst = channel*nrows_dst + row_dst;
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
+
+    const half * x = (half *) vx;
+
+    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+
+    const int nrows_y = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst = row_x;
+
+    const int idst = channel*nrows_dst + row_dst;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
+        const int col_x = col_x0 + threadIdx.x;
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
+        const float xi = __half2float(x[ix]);
+
+        const int row_y = col_x;
+
+        const int iy = channel*nrows_y + row_y;
+
+        tmp += xi * y[iy];
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+    const float * xi = (float *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+    const float * xi = (float *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = __float2half(*xi);
+}
+
+template <cpy_kernel_t cpy_1>
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= ne) {
+        return;
+    }
+
+    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = i - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = i - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+// rope == RoPE == rotary positional embedding
 static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
     const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
 
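The cpy_f32_f16 kernel added above recovers 3D indices from a flat element index and turns them into byte offsets on both the source and destination side, which is what lets it copy between two differently strided views of a tensor. The following standalone sketch (not part of the gem; names and sizes are illustrative) exercises the same index math end to end:

```cuda
// Mirrors the index decomposition used by cpy_f32_f16: flat index i -> (i02, i01, i00),
// then byte offsets via per-dimension strides on both sides of the copy.
#include <cuda_fp16.h>
#include <cstdio>

__global__ void cpy_f32_to_f16(const char * cx, char * cdst, int ne,
                               int ne00, int ne01, int nb00, int nb01, int nb02,
                               int ne10, int ne11, int nb10, int nb11, int nb12) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    // source indices and byte offset
    const int i02 = i / (ne00*ne01);
    const int i01 = (i - i02*ne01*ne00) / ne00;
    const int i00 = i - i02*ne01*ne00 - i01*ne00;
    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
    // destination indices and byte offset
    const int i12 = i / (ne10*ne11);
    const int i11 = (i - i12*ne10*ne11) / ne10;
    const int i10 = i - i12*ne10*ne11 - i11*ne10;
    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;

    *(half *)(cdst + dst_offset) = __float2half(*(const float *)(cx + x_offset));
}

int main() {
    const int ne00 = 4, ne01 = 3, ne02 = 2;      // 4x3x2 tensor
    const int ne = ne00*ne01*ne02;
    float h_src[ne];
    for (int i = 0; i < ne; ++i) h_src[i] = (float) i;

    float * d_src; half * d_dst;
    cudaMalloc(&d_src, ne*sizeof(float));
    cudaMalloc(&d_dst, ne*sizeof(half));
    cudaMemcpy(d_src, h_src, ne*sizeof(float), cudaMemcpyHostToDevice);

    // contiguous byte strides for both source and destination
    cpy_f32_to_f16<<<(ne + 31)/32, 32>>>((const char *) d_src, (char *) d_dst, ne,
        ne00, ne01, sizeof(float), ne00*sizeof(float), ne00*ne01*sizeof(float),
        ne00, ne01, sizeof(half),  ne00*sizeof(half),  ne00*ne01*sizeof(half));

    half h_dst[ne];
    cudaMemcpy(h_dst, d_dst, ne*sizeof(half), cudaMemcpyDeviceToHost);
    printf("element 17 copied as %f\n", __half2float(h_dst[17]));  // 17.000000

    cudaFree(d_src); cudaFree(d_dst);
    return 0;
}
```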
@@ -758,6 +906,72 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int i = row*ncols + col;
+    // dst[i] = col > n_past + row ? -INFINITY : x[i];
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+}
+
+// the CUDA soft max implementation differs from the CPU implementation
+// instead of doubles floats are used
+// values are also not normalized to the maximum value by subtracting it in the exponential function
+// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
+static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int block_size = blockDim.x;
+    const int tid = threadIdx.x;
+
+    float tmp = 0.0;
+
+    for (int block_start = 0; block_start < ncols; block_start += block_size) {
+        const int col = block_start + tid;
+
+        if (col >= ncols) {
+            break;
+        }
+
+        const int i = row*ncols + col;
+        const float val = expf(x[i]);
+        tmp += val;
+        dst[i] = val;
+    }
+
+    // sum up partial sums
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    for (int block_start = 0; block_start < ncols; block_start += block_size) {
+        const int col = block_start + tid;
+
+        if (col >= ncols) {
+            break;
+        }
+
+        const int i = row*ncols + col;
+        dst[i] /= tmp;
+    }
+}
+
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = scale * x[i];
+}
+
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
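soft_max_f32 above, like the new matrix-vector kernels, finishes with a warp-level butterfly reduction: each `__shfl_xor_sync` step folds values across lanes whose indices differ in one bit, so after five steps every lane holds the full sum. A minimal, self-contained sketch of that pattern (illustrative only, not taken from the diff):

```cuda
// One warp sums 32 values without shared memory using the same
// __shfl_xor_sync butterfly that the kernels above rely on.
#include <cstdio>

__global__ void warp_sum(const float * x, float * out) {
    float tmp = x[threadIdx.x];          // one value per lane
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }
    if (threadIdx.x == 0) {
        *out = tmp;                      // lane 0 writes the warp-wide sum
    }
}

int main() {
    float h_x[32], *d_x, *d_out, h_out;
    for (int i = 0; i < 32; ++i) h_x[i] = 1.0f;   // expected sum: 32
    cudaMalloc(&d_x, sizeof(h_x));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice);
    warp_sum<<<1, 32>>>(d_x, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp sum = %.1f\n", h_out);  // prints 32.0
    cudaFree(d_x); cudaFree(d_out);
    return 0;
}
```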
@@ -831,73 +1045,92 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<
+    dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
+    const int ny = 2;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
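The launchers above now round the row count up to a whole number of y-blocks with `(nrows + ny - 1) / ny`, which is why the kernels gained an explicit `row >= nrows` guard earlier in this diff. A small standalone sketch of the same launch geometry (illustrative; `ny` stands in for GGML_CUDA_DMMV_Y and the kernel just sums rows):

```cuda
// Rows are mapped to blockIdx.y*blockDim.y + threadIdx.y; padded rows created
// by the round-up simply return early, exactly as in dequantize_mul_mat_vec.
#include <cstdio>

__global__ void row_sum(const float * x, float * dst, const int ncols, const int nrows) {
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    if (row >= nrows) {
        return;                              // padded rows do nothing
    }
    float tmp = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += 32) {
        tmp += x[row*ncols + col];
    }
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }
    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

int main() {
    const int ncols = 64, nrows = 5, ny = 2;     // nrows not divisible by ny on purpose
    float h_x[ncols*nrows], h_dst[nrows], *d_x, *d_dst;
    for (int i = 0; i < ncols*nrows; ++i) h_x[i] = 1.0f;
    cudaMalloc(&d_x, sizeof(h_x));
    cudaMalloc(&d_dst, sizeof(h_dst));
    cudaMemcpy(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice);

    const int block_num_y = (nrows + ny - 1) / ny;   // 3 blocks cover 6 >= 5 rows
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(32, ny, 1);                // one warp per row
    row_sum<<<block_nums, block_dims>>>(d_x, d_dst, ncols, nrows);

    cudaMemcpy(h_dst, d_dst, sizeof(h_dst), cudaMemcpyDeviceToHost);
    printf("row 4 sum = %.1f (expected 64.0)\n", h_dst[4]);
    cudaFree(d_x); cudaFree(d_dst);
    return 0;
}
```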
@@ -907,10 +1140,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
@@ -942,6 +1176,47 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }
 
+static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
+    const dim3 block_nums(1, nrows_x, nchannels_x);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+}
+
+static void ggml_mul_mat_vec_nc_f16_f32_cuda(
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
+    const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+
+    const dim3 block_nums(1, nrows_x, nchannels_x);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
+}
+
+static void ggml_cpy_f32_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void ggml_cpy_f32_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
+}
+
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 2 == 0);
     const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -950,6 +1225,19 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }
 
+static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, nrows_x, 1);
+    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
+}
+
+static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(1, nrows_x, 1);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256
 
@@ -1105,6 +1393,9 @@ void * ggml_cuda_host_malloc(size_t size) {
     void * ptr = nullptr;
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
+        // The allocation error can be bypassed. A null ptr will assigned out of this function.
+        // This can fixed the OOM error in WSL.
+        cudaGetLastError();
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
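The new `cudaGetLastError()` call above consumes CUDA's sticky error state after a failed pinned allocation so that later error checks elsewhere are not poisoned; the caller simply falls back to ordinary pageable memory when the function returns nullptr. A hedged, standalone sketch of that caller-side pattern (not the gem's actual code; the helper name is made up):

```cuda
// Try pinned host memory first, clear the error, and fall back to malloc.
#include <cstdio>
#include <cstdlib>

static void * pinned_or_pageable(size_t size) {
    void * ptr = nullptr;
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
        cudaGetLastError();   // consume the error so later checks are not poisoned
        fprintf(stderr, "pinned alloc of %zu bytes failed (%s), using malloc\n",
                size, cudaGetErrorString(err));
        return malloc(size);
    }
    return ptr;
}

int main() {
    void * buf = pinned_or_pageable(1 << 20);
    printf("got buffer at %p\n", buf);
    // NOTE: a real caller must remember which path succeeded and release the
    // buffer with cudaFreeHost() or free() accordingly.
    return 0;
}
```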
@@ -1117,10 +1408,25 @@ void ggml_cuda_host_free(void * ptr) {
     CUDA_CHECK(cudaFreeHost(ptr));
 }
 
-static cudaError_t
+static cudaError_t ggml_cuda_cpy_tensor_2d(
     void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
 
-
+    cudaMemcpyKind kind;
+    char * src_ptr;
+    if (src->backend == GGML_BACKEND_CPU) {
+        kind = cudaMemcpyHostToDevice;
+        src_ptr = (char *) src->data;
+    } else if (src->backend == GGML_BACKEND_GPU) {
+        kind = cudaMemcpyDeviceToDevice;
+        struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        int id;
+        CUDA_CHECK(cudaGetDevice(&id));
+        src_ptr = (char *) extra->data_device[id];
+    } else {
+        GGML_ASSERT(false);
+    }
+    char * dst_ptr = (char *) dst;
+
     const int64_t ne0 = src->ne[0];
     const int64_t nb0 = src->nb[0];
     const int64_t nb1 = src->nb[1];
@@ -1131,17 +1437,17 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(
     const int64_t bs = ggml_blck_size(type);
     int64_t i1_diff = i1_high - i1_low;
 
-    const
+    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
-        return cudaMemcpyAsync(
+        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
     } else if (nb0 == ts) {
-        return cudaMemcpy2DAsync(
+        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
     } else {
         for (int64_t i1 = 0; i1 < i1_diff; i1++) {
             const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
             // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0,
+            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
             if (r != cudaSuccess) return r;
         }
         return cudaSuccess;
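The generalized `ggml_cuda_cpy_tensor_2d` above picks between a flat `cudaMemcpyAsync` and a pitched `cudaMemcpy2DAsync` depending on the row strides. The following standalone sketch (illustrative sizes only, not from the gem) shows the 2D path copying rows that are padded in the source into a packed device buffer:

```cuda
// Copy 3 padded host rows (pitch 6 floats) into a packed 3x4 device matrix
// with a single pitched, stream-ordered copy.
#include <cstdio>

int main() {
    const int ne0 = 4, nrows = 3;
    const size_t src_pitch = 6*sizeof(float);   // padded: 6 floats stored per row
    const size_t row_bytes = ne0*sizeof(float); // only 4 floats are meaningful

    float h_src[3*6];
    for (int r = 0; r < nrows; ++r)
        for (int c = 0; c < 6; ++c)
            h_src[r*6 + c] = r*10.0f + c;

    float * d_dst;                              // packed 3x4 destination
    cudaMalloc(&d_dst, nrows*row_bytes);

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMemcpy2DAsync(d_dst, row_bytes,         // dst, dst pitch (packed)
                      h_src, src_pitch,         // src, src pitch (padded)
                      row_bytes, nrows,         // width in bytes, height in rows
                      cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    float h_dst[3*4];
    cudaMemcpy(h_dst, d_dst, sizeof(h_dst), cudaMemcpyDeviceToHost);
    printf("dst[1][3] = %.1f (expected 13.0)\n", h_dst[1*4 + 3]);

    cudaStreamDestroy(stream);
    cudaFree(d_dst);
    return 0;
}
```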
@@ -1377,8 +1683,81 @@ inline void ggml_cuda_op_rope(
     (void) i1;
 }
 
+inline void ggml_cuda_op_diag_mask_inf(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    const int n_past = ((int32_t *) src1->data)[0];
+
+    // compute
+    diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
+inline void ggml_cuda_op_soft_max(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
+inline void ggml_cuda_op_scale(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const float scale = ((float *) src1->data)[0];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    ggml_cuda_op_t op, bool src0_needs_f32) {
+    ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1401,21 +1780,27 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
 
     // strides for iteration over dims 3 and 2
-    const int64_t
-    const int64_t
-    const int64_t
-    const int64_t
+    const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
+    const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
+    const int64_t src0_stride = ne00 * ne01 * stride_mod;
+    const int64_t src1_stride = ne10 * ne11 * stride_mod;
+    const int64_t dst_stride = ne0 * ne1 * stride_mod;
 
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
 
-    struct ggml_tensor_extra_gpu * src0_extra =
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    struct ggml_tensor_extra_gpu * dst_extra
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
 
+    const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
+    const bool src1_stays_on_host = use_src1 && (
+        dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
+
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
 
     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
@@ -1424,13 +1809,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
     float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
     float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
-    float *
+    float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
     // asq = actual size quantized, asf = actual size float
     size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
     size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t
+    size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
     for (int id = 0; id < g_device_count; ++id) {
         if (!split && id != g_main_device) {
@@ -1443,9 +1828,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_low, row_high;
         if (split) {
             row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-            row_low -= row_low % GGML_CUDA_DMMV_Y;
             row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_DMMV_Y;
         } else {
             row_low = 0;
             row_high = nrows0;
@@ -1458,7 +1841,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
         cudaSetDevice(id);
 
-        if (src0_on_device) {
+        if (src0_on_device && src0_is_contiguous) {
             if (src0_is_f32) {
                 src0_ddf[id] = (float *) src0_extra->data_device[id];
             } else {
@@ -1476,8 +1859,8 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
         }
 
-        if (use_src1) {
-            if (src1_on_device) {
+        if (use_src1 && !src1_stays_on_host) {
+            if (src1_on_device && src1_is_contiguous) {
                 src1_ddf[id] = (float *) src1_extra->data_device[id];
             } else {
                 src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
@@ -1490,26 +1873,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
         }
 
-
+        const int64_t i03_max = flatten_rows ? 1 : ne03;
+        const int64_t i02_max = flatten_rows ? 1 : ne02;
+        const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+
+        for (int64_t i03 = 0; i03 < i03_max; i03++) {
            const int64_t i13 = i03 % ne13;
-            for (int64_t i02 = 0; i02 <
+            for (int64_t i02 = 0; i02 < i02_max; i02++) {
                 const int64_t i12 = i02 % ne12;
 
                 const int64_t i0 = i03*ne02 + i02;
-
-
+
+                // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
+                const int64_t i0_offset_low = row_low/rows_per_iter;
+                const int64_t i0_offset_high = row_high/rows_per_iter;
 
                 int64_t i01_low = 0;
-                int64_t i01_high =
+                int64_t i01_high = rows_per_iter;
                 if (split) {
                     if (i0 < i0_offset_low || i0 > i0_offset_high) {
                         continue;
                     }
                     if (i0 == i0_offset_low) {
-                        i01_low = row_low %
+                        i01_low = row_low % rows_per_iter;
                     }
                     if (i0 == i0_offset_high) {
-                        i01_high = row_high %
+                        i01_high = row_high % rows_per_iter;
                     }
                 }
 
@@ -1518,7 +1907,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
                 // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
                 GGML_ASSERT(i01_low == 0 || g_device_count > 1);
-                GGML_ASSERT(i01_high ==
+                GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
 
                 const int64_t i01_diff = i01_high - i01_low;
                 if (i01_diff == 0) {
@@ -1526,24 +1915,23 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
                 const int64_t i11 = i13*ne12 + i12;
 
-                cudaStream_t cudaStream_main
+                cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
                 cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
-                cudaEvent_t cudaEvent_memcpy_src1
+                cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
 
                 // for split tensors the data begins at i0 == i0_offset_low
                 char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
                 float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
                 float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
-                float * dst_ddf_i
+                float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
 
                 // for split tensors the data pointer needs to be rounded down
                 // to the bin edge for i03, i02 bins beyond the first
                 if (i0 - i0_offset_low > 0) {
+                    GGML_ASSERT(!flatten_rows);
                     src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
                     src0_ddf_i -= (row_low % ne01)*ne00;
-
-                if (i0 - i0_offset_low > 0) {
-                    dst_ddf_i -= (row_low % ne0)*ne1;
+                    dst_ddf_i -= (row_low % ne0)*ne1;
                 }
 
                 // the main device memory buffer can be on VRAM scratch, with space for all partial results
@@ -1553,30 +1941,37 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
 
                 // copy src0, src1 to device if necessary
-                if (use_src1) {
+                if (use_src1 && !src1_stays_on_host) {
                     if (src1->backend == GGML_BACKEND_CPU) {
-
-
+                        GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
+                        int64_t nrows1 = flatten_rows ? nrows0 : ne11;
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
+                    } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                         if (id != g_main_device) {
+                            GGML_ASSERT(!flatten_rows);
                             float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                             src1_ddf_i_source += i11*src1_stride;
                             CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
                                 cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
                         }
+                    } else if (src1_on_device && !src1_is_contiguous) {
+                        GGML_ASSERT(!split);
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
                     } else {
                         GGML_ASSERT(false);
                     }
                 }
                 CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
-
+
+                if (!src0_on_device || !src0_is_contiguous) {
                     if (src0_is_f32) {
-                        CUDA_CHECK(
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
                     } else {
-                        CUDA_CHECK(
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
                     }
                 }
 
-                // convert src0 to f32 if it
+                // convert src0 to f32 if it is necessary for the ggml_cuda_op
                 if (src0_needs_f32 && !src0_is_f32) {
                     to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
                     CUDA_CHECK(cudaGetLastError());
@@ -1641,39 +2036,30 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }
 
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
}
 
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
-    // if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) {
-    //     fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n",
-    //         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-    //         src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
-    //         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
-    //     return false;
-    // }
-
     // TODO: find the optimal values for these
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
@@ -1685,23 +2071,158 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
     return false;
 }
 
+void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
+    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    CUDA_CHECK(cudaSetDevice(g_main_device));
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+    GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
+    GGML_ASSERT(!ggml_is_permuted(src0));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    CUDA_CHECK(cudaSetDevice(g_main_device));
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    const int row_stride_x = nb01 / sizeof(half);
+    const int channel_stride_x = nb02 / sizeof(half);
+
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-
+    bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
+
+    if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
+    } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
+        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    }else if (src0->type == GGML_TYPE_F32) {
+        ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
         } else {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
        }
     } else {
         GGML_ASSERT(false);
     }
 }
 
+void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+}
+
+void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne == ggml_nelements(src1));
+
+    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    GGML_ASSERT(src0->ne[3] == 1);
+
+    const int64_t nb00 = src0->nb[0];
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(src1->ne[3] == 1);
+
+    const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+
+    CUDA_CHECK(cudaSetDevice(g_main_device));
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+
+    const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+
+    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
+
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+                              ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+                              ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    (void) dst;
+}
+
+void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+}
+
+void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+}
+
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -1710,16 +2231,14 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }
 
-void
-    FILE * fp = fopen(fname, "rb");
+void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
     const size_t nb1 = tensor->nb[1];
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    memset(extra, 0, sizeof(*extra));
 
     for (int id = 0; id < g_device_count; ++id) {
-        extra->data_device[id] = nullptr;
-
         if (backend == GGML_BACKEND_GPU && id != g_main_device) {
             continue;
         }
@@ -1732,10 +2251,7 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
             row_high = nrows;
         } else if (backend == GGML_BACKEND_GPU_SPLIT) {
             row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-            row_low -= row_low % GGML_CUDA_DMMV_Y;
             row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_DMMV_Y;
-            GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
         } else {
             GGML_ASSERT(false);
         }
@@ -1745,35 +2261,19 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
 
         int64_t nrows_split = row_high - row_low;
 
-        const size_t offset_split =
+        const size_t offset_split = row_low*nb1;
         const size_t size = ggml_nbytes_split(tensor, nrows_split);
 
         void * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        void * buf_host =
-
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
-#else
-        int ret = fseek(fp, (long) offset_split, SEEK_SET);
-#endif
-        GGML_ASSERT(ret == 0); // same
-
-        size_t ret2 = fread(buf_host, size, 1, fp);
-        if (ret2 != 1) {
-            fprintf(stderr, "unexpectedly reached end of file");
-            exit(1);
-        }
+        void * buf_host = (char*)data + offset_split;
 
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
-        cudaDeviceSynchronize();
 
-        free(buf_host);
         extra->data_device[id] = buf;
     }
 
     tensor->extra = extra;
-    fclose(fp);
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
@@ -1795,47 +2295,78 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
-void
-    if (
-
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+    if (scratch && g_scratch_size == 0) {
+        return;
     }
 
-
-
-
-
+    // recursively assign CUDA buffers until a compute tensor is found
+    if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src0->op;
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+        }
+    }
+    if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
    }
 
     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
 
-    bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data
+    const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+    const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
     if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
-
-
-
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + offset;
+    } else if (tensor->op == GGML_OP_CPY) {
+        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
+        void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra->data_device[g_main_device] = src1_ddv;
+    } else if (scratch) {
+        GGML_ASSERT(size <= g_scratch_size);
+        if (g_scratch_offset + size > g_scratch_size) {
+            g_scratch_offset = 0;
+        }
+
         char * data = (char *) g_scratch_buffer;
         if (data == nullptr) {
             CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
             g_scratch_buffer = data;
         }
         extra->data_device[g_main_device] = data + g_scratch_offset;
-    }
 
-
-
-
-
+        g_scratch_offset += size;
+
+        GGML_ASSERT(g_scratch_offset <= g_scratch_size);
+    } else { // allocate new buffers outside of scratch
+        void * data;
+        CUDA_CHECK(cudaMalloc(&data, size));
+        CUDA_CHECK(cudaMemset(data, 0, size));
+        extra->data_device[g_main_device] = data;
+    }
 
-    GGML_ASSERT(g_scratch_offset <= g_scratch_size);
     tensor->extra = extra;
 }
 
+void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true);
+}
+
+void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false);
+}
+
 void ggml_cuda_set_main_device(int main_device) {
-    if (main_device
+    if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
             main_device, g_device_count, g_main_device);
         return;
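The scratch branch of `ggml_cuda_assign_buffers_impl` above is essentially a wrapping bump allocator over a single lazily created VRAM buffer. A minimal sketch of that strategy under simplifying assumptions (single device, no ggml types; the helper name is made up for illustration):

```cuda
// Wrapping bump allocator: a single scratch buffer, an offset that resets to 0
// when the next allocation would not fit, and an assert that no single
// allocation exceeds the scratch capacity.
#include <cassert>
#include <cstdio>

static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size   = 1 << 20;   // 1 MiB scratch for the example
static size_t g_scratch_offset = 0;

static void * scratch_alloc(size_t size) {
    assert(size <= g_scratch_size);
    if (g_scratch_offset + size > g_scratch_size) {
        g_scratch_offset = 0;               // wrap: older results get overwritten
    }
    if (g_scratch_buffer == nullptr) {
        cudaMalloc(&g_scratch_buffer, g_scratch_size);
    }
    void * ptr = (char *) g_scratch_buffer + g_scratch_offset;
    g_scratch_offset += size;
    return ptr;
}

int main() {
    void * a = scratch_alloc(600 * 1024);
    void * b = scratch_alloc(600 * 1024);   // does not fit after a -> wraps to offset 0
    printf("a=%p b=%p (b reuses the start of the scratch buffer: %d)\n",
           a, b, a == b);
    cudaFree(g_scratch_buffer);
    return 0;
}
```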
+
|
1837
2368
|
void ggml_cuda_set_main_device(int main_device) {
|
1838
|
-
if (main_device
|
2369
|
+
if (main_device >= g_device_count) {
|
1839
2370
|
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
1840
2371
|
main_device, g_device_count, g_main_device);
|
1841
2372
|
return;
|
@@ -1852,6 +2383,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) {
|
|
1852
2383
|
g_scratch_size = scratch_size;
|
1853
2384
|
}
|
1854
2385
|
|
2386
|
+
void ggml_cuda_free_scratch() {
|
2387
|
+
if (g_scratch_buffer == nullptr) {
|
2388
|
+
return;
|
2389
|
+
}
|
2390
|
+
|
2391
|
+
CUDA_CHECK(cudaFree(g_scratch_buffer));
|
2392
|
+
g_scratch_buffer = nullptr;
|
2393
|
+
}
|
2394
|
+
|
1855
2395
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
1856
2396
|
ggml_cuda_func_t func;
|
1857
2397
|
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
@@ -1889,12 +2429,39 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
1889
2429
|
}
|
1890
2430
|
func = ggml_cuda_mul_mat;
|
1891
2431
|
break;
|
2432
|
+
case GGML_OP_SCALE:
|
2433
|
+
if (!any_on_device) {
|
2434
|
+
return false;
|
2435
|
+
}
|
2436
|
+
func = ggml_cuda_scale;
|
2437
|
+
break;
|
2438
|
+
case GGML_OP_CPY:
|
2439
|
+
if (!any_on_device) {
|
2440
|
+
return false;
|
2441
|
+
}
|
2442
|
+
func = ggml_cuda_cpy;
|
2443
|
+
break;
|
1892
2444
|
case GGML_OP_RESHAPE:
|
2445
|
+
case GGML_OP_VIEW:
|
2446
|
+
case GGML_OP_PERMUTE:
|
2447
|
+
case GGML_OP_TRANSPOSE:
|
1893
2448
|
if (!any_on_device) {
|
1894
2449
|
return false;
|
1895
2450
|
}
|
1896
2451
|
func = ggml_cuda_nop;
|
1897
2452
|
break;
|
2453
|
+
case GGML_OP_DIAG_MASK_INF:
|
2454
|
+
if (!any_on_device) {
|
2455
|
+
return false;
|
2456
|
+
}
|
2457
|
+
func = ggml_cuda_diag_mask_inf;
|
2458
|
+
break;
|
2459
|
+
case GGML_OP_SOFT_MAX:
|
2460
|
+
if (!any_on_device) {
|
2461
|
+
return false;
|
2462
|
+
}
|
2463
|
+
func = ggml_cuda_soft_max;
|
2464
|
+
break;
|
1898
2465
|
case GGML_OP_ROPE:
|
1899
2466
|
if (!any_on_device) {
|
1900
2467
|
return false;
|