llama_cpp 0.6.0 → 0.7.1: diff of the bundled ggml CUDA source between the two gem versions

@@ -62,6 +62,7 @@
62
62
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
63
63
  #define cudaMemcpyKind hipMemcpyKind
64
64
  #define cudaMemset hipMemset
65
+ #define cudaMemsetAsync hipMemsetAsync
65
66
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
66
67
  #define cudaSetDevice hipSetDevice
67
68
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -80,9 +81,9 @@
80
81
  #include "ggml.h"
81
82
 
82
83
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
83
- #define CC_TURING 700
84
+ #define CC_VOLTA 700
84
85
  #define CC_OFFSET_AMD 1000000
85
- #define CC_RDNA2 CC_OFFSET_AMD + 1030
86
+ #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
86
87
 
87
88
  #if defined(GGML_USE_HIPBLAS)
88
89
  #define __CUDA_ARCH__ 1300
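
Note on the hunk above: CC_TURING is renamed to CC_VOLTA while the value stays 700, which is Volta's compute capability, so the constant now matches the hardware generation it actually gates. CC_RDNA2 also gains parentheses, the usual macro-hygiene fix for a compound expression. A minimal, self-contained illustration of why the parentheses matter (the compared value is made up for the demo, not taken from the gem):

    // macro_precedence_demo.cpp -- builds with any C++ compiler (or nvcc)
    #include <cstdio>

    #define CC_OFFSET_AMD 1000000
    #define CC_RDNA2_OLD  CC_OFFSET_AMD + 1030    // unparenthesized, as in 0.6.0
    #define CC_RDNA2_NEW  (CC_OFFSET_AMD + 1030)  // parenthesized, as in 0.7.1

    int main() {
        const int cc = 1001100;  // hypothetical AMD "compute capability" (offset + device id)
        // The old form expands to cc - 1000000 + 1030, evaluated left to right as
        // (cc - 1000000) + 1030 rather than the intended cc - (1000000 + 1030).
        printf("old: %d\n", cc - CC_RDNA2_OLD);   // prints 2130
        printf("new: %d\n", cc - CC_RDNA2_NEW);   // prints 70
        return 0;
    }
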
@@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
414
415
  #define CUDA_SILU_BLOCK_SIZE 256
415
416
  #define CUDA_CPY_BLOCK_SIZE 32
416
417
  #define CUDA_SCALE_BLOCK_SIZE 256
418
+ #define CUDA_CLAMP_BLOCK_SIZE 256
417
419
  #define CUDA_ROPE_BLOCK_SIZE 256
418
420
  #define CUDA_ALIBI_BLOCK_SIZE 32
419
421
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
420
422
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
421
423
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
424
+ #define CUDA_GET_ROWS_BLOCK_SIZE 256
422
425
 
423
426
  // dmmv = dequantize_mul_mat_vec
424
427
  #ifndef GGML_CUDA_DMMV_X
@@ -715,7 +718,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
715
718
 
716
719
  //================================== k-quants
717
720
 
718
- static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
721
+ template<typename dst_t>
722
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
719
723
 
720
724
  const int i = blockIdx.x;
721
725
  const block_q2_K * x = (const block_q2_K *) vx;
@@ -727,7 +731,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
727
731
  const int is = 8*n + l/16;
728
732
 
729
733
  const uint8_t q = x[i].qs[32*n + l];
730
- float * y = yy + i*QK_K + 128*n;
734
+ dst_t * y = yy + i*QK_K + 128*n;
731
735
 
732
736
  float dall = __low2half(x[i].dm);
733
737
  float dmin = __high2half(x[i].dm);
@@ -739,7 +743,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
739
743
  const int is = tid/16; // 0 or 1
740
744
  const int il = tid%16; // 0...15
741
745
  const uint8_t q = x[i].qs[il] >> (2*is);
742
- float * y = yy + i*QK_K + 16*is + il;
746
+ dst_t * y = yy + i*QK_K + 16*is + il;
743
747
  float dall = __low2half(x[i].dm);
744
748
  float dmin = __high2half(x[i].dm);
745
749
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
@@ -748,7 +752,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
748
752
 
749
753
  }
750
754
 
751
- static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
755
+ template<typename dst_t>
756
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
752
757
 
753
758
  const int i = blockIdx.x;
754
759
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -772,7 +777,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
772
777
  float d_all = x[i].d;
773
778
  float dl = d_all * (us - 32);
774
779
 
775
- float * y = yy + i*QK_K + 128*n + 32*j;
780
+ dst_t * y = yy + i*QK_K + 128*n + 32*j;
776
781
  const uint8_t * q = x[i].qs + 32*n;
777
782
  const uint8_t * hm = x[i].hmask;
778
783
 
@@ -784,7 +789,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
784
789
  const int im = il/8; // 0...1
785
790
  const int in = il%8; // 0...7
786
791
 
787
- float * y = yy + i*QK_K + 16*is + il;
792
+ dst_t * y = yy + i*QK_K + 16*is + il;
788
793
 
789
794
  const uint8_t q = x[i].qs[il] >> (2*is);
790
795
  const uint8_t h = x[i].hmask[in] >> (2*is + im);
@@ -812,7 +817,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
812
817
  }
813
818
  #endif
814
819
 
815
- static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
820
+ template<typename dst_t>
821
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
816
822
  const block_q4_K * x = (const block_q4_K *) vx;
817
823
 
818
824
  const int i = blockIdx.x;
@@ -825,7 +831,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
825
831
  const int is = 2*il;
826
832
  const int n = 4;
827
833
 
828
- float * y = yy + i*QK_K + 64*il + n*ir;
834
+ dst_t * y = yy + i*QK_K + 64*il + n*ir;
829
835
 
830
836
  const float dall = __low2half(x[i].dm);
831
837
  const float dmin = __high2half(x[i].dm);
@@ -844,7 +850,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
844
850
  #else
845
851
  const int tid = threadIdx.x;
846
852
  const uint8_t * q = x[i].qs;
847
- float * y = yy + i*QK_K;
853
+ dst_t * y = yy + i*QK_K;
848
854
  const float d = (float)x[i].dm[0];
849
855
  const float m = (float)x[i].dm[1];
850
856
  y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
@@ -852,7 +858,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
852
858
  #endif
853
859
  }
854
860
 
855
- static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
861
+ template<typename dst_t>
862
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
856
863
  const block_q5_K * x = (const block_q5_K *) vx;
857
864
 
858
865
  const int i = blockIdx.x;
@@ -864,7 +871,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
864
871
  const int ir = tid%16; // ir is in 0...15
865
872
  const int is = 2*il; // is is in 0...6
866
873
 
867
- float * y = yy + i*QK_K + 64*il + 2*ir;
874
+ dst_t * y = yy + i*QK_K + 64*il + 2*ir;
868
875
 
869
876
  const float dall = __low2half(x[i].dm);
870
877
  const float dmin = __high2half(x[i].dm);
@@ -892,13 +899,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
892
899
  const int is = tid/16; // 0 or 1
893
900
  const uint8_t h = x[i].qh[in] >> im;
894
901
  const float d = x[i].d;
895
- float * y = yy + i*QK_K + tid;
902
+ dst_t * y = yy + i*QK_K + tid;
896
903
  y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
897
904
  y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
898
905
  #endif
899
906
  }
900
907
 
901
- static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
908
+ template<typename dst_t>
909
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
902
910
  const block_q6_K * x = (const block_q6_K *) vx;
903
911
 
904
912
  const int i = blockIdx.x;
@@ -910,7 +918,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
910
918
  const int il = tid - 32*ip; // 0...32
911
919
  const int is = 8*ip + il/16;
912
920
 
913
- float * y = yy + i*QK_K + 128*ip + il;
921
+ dst_t * y = yy + i*QK_K + 128*ip + il;
914
922
 
915
923
  const float d = x[i].d;
916
924
 
@@ -929,7 +937,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
929
937
  const int ip = tid/16; // 0 or 1
930
938
  const int il = tid - 16*ip; // 0...15
931
939
 
932
- float * y = yy + i*QK_K + 16*ip + il;
940
+ dst_t * y = yy + i*QK_K + 16*ip + il;
933
941
 
934
942
  const float d = x[i].d;
935
943
 
@@ -1569,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1569
1577
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1570
1578
  }
1571
1579
 
1580
+ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1581
+ static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
1582
+ const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1583
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
1584
+
1585
+ if (col >= ncols) {
1586
+ return;
1587
+ }
1588
+
1589
+ const int r = y[row];
1590
+
1591
+ // copy x[r*ncols + col] to dst[row*ncols + col]
1592
+ const int xi = r*ncols + col;
1593
+ const int di = row*ncols + col;
1594
+
1595
+ const int ib = xi/qk; // block index
1596
+ const int iqs = (xi%qk)/qr; // quant index
1597
+ const int iybs = di - di%qk; // y block start index
1598
+ const int y_offset = qr == 1 ? 1 : qk/2;
1599
+
1600
+ // dequantize
1601
+ dfloat2 v;
1602
+ dequantize_kernel(x, ib, iqs, v);
1603
+
1604
+ dst[iybs + iqs + 0] = v.x;
1605
+ dst[iybs + iqs + y_offset] = v.y;
1606
+ }
1607
+
1572
1608
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1573
1609
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1574
1610
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
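
The k_get_rows kernel added above gathers whole rows of a possibly quantized matrix according to an int32 index tensor, dequantizing two values per thread. Below is a hedged, float-only sketch of the same gather so the index math can be read in isolation; the kernel name, the host harness and the one-output-row-per-grid-row simplification are mine, not the gem's:

    // get_rows_f32_sketch.cu -- simplified, float-only version of the k_get_rows gather
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void k_get_rows_f32(const float * x, const int32_t * rows, float * dst, const int ncols) {
        const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2; // each thread covers two columns
        const int row = blockIdx.y;                              // one output row per grid row
        if (col >= ncols) {
            return;
        }
        const int r = rows[row];                                 // which source row to gather
        dst[row*ncols + col + 0] = x[r*ncols + col + 0];
        dst[row*ncols + col + 1] = x[r*ncols + col + 1];
    }

    int main() {
        const int ncols = 8, nrows_src = 4, nrows_dst = 2;
        float h_x[nrows_src*ncols];
        for (int i = 0; i < nrows_src*ncols; ++i) h_x[i] = (float) i;
        int32_t h_rows[nrows_dst] = {3, 1};                      // gather source rows 3 and 1

        float *d_x, *d_dst; int32_t *d_rows;
        cudaMalloc(&d_x,    sizeof(h_x));
        cudaMalloc(&d_rows, sizeof(h_rows));
        cudaMalloc(&d_dst,  nrows_dst*ncols*sizeof(float));
        cudaMemcpy(d_x,    h_x,    sizeof(h_x),    cudaMemcpyHostToDevice);
        cudaMemcpy(d_rows, h_rows, sizeof(h_rows), cudaMemcpyHostToDevice);

        const int block_size = 256;                              // CUDA_GET_ROWS_BLOCK_SIZE defined earlier in the diff
        const dim3 block_dims(block_size, 1, 1);
        const dim3 block_nums((ncols + 2*block_size - 1)/(2*block_size), nrows_dst, 1);
        k_get_rows_f32<<<block_nums, block_dims>>>(d_x, d_rows, d_dst, ncols);

        float h_dst[nrows_dst*ncols];
        cudaMemcpy(h_dst, d_dst, sizeof(h_dst), cudaMemcpyDeviceToHost);
        printf("first gathered value: %.0f (source row 3 starts at 3*8 = 24)\n", h_dst[0]);
        cudaFree(d_x); cudaFree(d_rows); cudaFree(d_dst);
        return 0;
    }
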
@@ -3548,7 +3584,7 @@ template <bool need_check> static __global__ void
3548
3584
  load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3549
3585
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3550
3586
 
3551
- #elif __CUDA_ARCH__ >= CC_TURING
3587
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3552
3588
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
3553
3589
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3554
3590
  const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3568,7 +3604,7 @@ template <bool need_check> static __global__ void
3568
3604
  #else
3569
3605
  (void) vec_dot_q4_0_q8_1_mul_mat;
3570
3606
  assert(false);
3571
- #endif // __CUDA_ARCH__ >= CC_TURING
3607
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3572
3608
  }
3573
3609
 
3574
3610
  #define MMQ_X_Q4_1_RDNA2 64
@@ -3589,9 +3625,9 @@ template <bool need_check> static __global__ void
3589
3625
  #if defined(RDNA3) || defined(RDNA2)
3590
3626
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
3591
3627
  #endif // defined(RDNA3) || defined(RDNA2)
3592
- #elif __CUDA_ARCH__ < CC_TURING
3628
+ #elif __CUDA_ARCH__ < CC_VOLTA
3593
3629
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3594
- #endif // __CUDA_ARCH__ < CC_TURING
3630
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3595
3631
  mul_mat_q4_1(
3596
3632
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3597
3633
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3611,7 +3647,7 @@ template <bool need_check> static __global__ void
3611
3647
  load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3612
3648
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3613
3649
 
3614
- #elif __CUDA_ARCH__ >= CC_TURING
3650
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3615
3651
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
3616
3652
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3617
3653
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3631,7 +3667,7 @@ template <bool need_check> static __global__ void
3631
3667
  #else
3632
3668
  (void) vec_dot_q4_1_q8_1_mul_mat;
3633
3669
  assert(false);
3634
- #endif // __CUDA_ARCH__ >= CC_TURING
3670
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3635
3671
  }
3636
3672
 
3637
3673
  #define MMQ_X_Q5_0_RDNA2 64
@@ -3672,7 +3708,7 @@ template <bool need_check> static __global__ void
3672
3708
  load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3673
3709
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3674
3710
 
3675
- #elif __CUDA_ARCH__ >= CC_TURING
3711
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3676
3712
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
3677
3713
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3678
3714
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3692,7 +3728,7 @@ template <bool need_check> static __global__ void
3692
3728
  #else
3693
3729
  (void) vec_dot_q5_0_q8_1_mul_mat;
3694
3730
  assert(false);
3695
- #endif // __CUDA_ARCH__ >= CC_TURING
3731
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3696
3732
  }
3697
3733
 
3698
3734
  #define MMQ_X_Q5_1_RDNA2 64
@@ -3733,7 +3769,7 @@ mul_mat_q5_1(
3733
3769
  load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3734
3770
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3735
3771
 
3736
- #elif __CUDA_ARCH__ >= CC_TURING
3772
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3737
3773
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
3738
3774
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3739
3775
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3753,7 +3789,7 @@ mul_mat_q5_1(
3753
3789
  #else
3754
3790
  (void) vec_dot_q5_1_q8_1_mul_mat;
3755
3791
  assert(false);
3756
- #endif // __CUDA_ARCH__ >= CC_TURING
3792
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3757
3793
  }
3758
3794
 
3759
3795
  #define MMQ_X_Q8_0_RDNA2 64
@@ -3794,7 +3830,7 @@ template <bool need_check> static __global__ void
3794
3830
  load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3795
3831
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3796
3832
 
3797
- #elif __CUDA_ARCH__ >= CC_TURING
3833
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3798
3834
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
3799
3835
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3800
3836
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3814,7 +3850,7 @@ template <bool need_check> static __global__ void
3814
3850
  #else
3815
3851
  (void) vec_dot_q8_0_q8_1_mul_mat;
3816
3852
  assert(false);
3817
- #endif // __CUDA_ARCH__ >= CC_TURING
3853
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3818
3854
  }
3819
3855
 
3820
3856
  #define MMQ_X_Q2_K_RDNA2 64
@@ -3855,7 +3891,7 @@ mul_mat_q2_K(
3855
3891
  load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3856
3892
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3857
3893
 
3858
- #elif __CUDA_ARCH__ >= CC_TURING
3894
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3859
3895
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
3860
3896
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3861
3897
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3875,7 +3911,7 @@ mul_mat_q2_K(
3875
3911
  #else
3876
3912
  (void) vec_dot_q2_K_q8_1_mul_mat;
3877
3913
  assert(false);
3878
- #endif // __CUDA_ARCH__ >= CC_TURING
3914
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3879
3915
  }
3880
3916
 
3881
3917
  #define MMQ_X_Q3_K_RDNA2 128
@@ -3896,9 +3932,9 @@ template <bool need_check> static __global__ void
3896
3932
  #if defined(RDNA3) || defined(RDNA2)
3897
3933
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
3898
3934
  #endif // defined(RDNA3) || defined(RDNA2)
3899
- #elif __CUDA_ARCH__ < CC_TURING
3935
+ #elif __CUDA_ARCH__ < CC_VOLTA
3900
3936
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3901
- #endif // __CUDA_ARCH__ < CC_TURING
3937
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3902
3938
  mul_mat_q3_K(
3903
3939
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3904
3940
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3918,7 +3954,7 @@ template <bool need_check> static __global__ void
3918
3954
  load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3919
3955
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3920
3956
 
3921
- #elif __CUDA_ARCH__ >= CC_TURING
3957
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3922
3958
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
3923
3959
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3924
3960
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3938,7 +3974,7 @@ template <bool need_check> static __global__ void
3938
3974
  #else
3939
3975
  (void) vec_dot_q3_K_q8_1_mul_mat;
3940
3976
  assert(false);
3941
- #endif // __CUDA_ARCH__ >= CC_TURING
3977
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3942
3978
  }
3943
3979
 
3944
3980
  #define MMQ_X_Q4_K_RDNA2 64
@@ -3959,9 +3995,9 @@ template <bool need_check> static __global__ void
3959
3995
  #if defined(RDNA3) || defined(RDNA2)
3960
3996
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
3961
3997
  #endif // defined(RDNA3) || defined(RDNA2)
3962
- #elif __CUDA_ARCH__ < CC_TURING
3998
+ #elif __CUDA_ARCH__ < CC_VOLTA
3963
3999
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3964
- #endif // __CUDA_ARCH__ < CC_TURING
4000
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3965
4001
  mul_mat_q4_K(
3966
4002
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3967
4003
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3981,7 +4017,7 @@ template <bool need_check> static __global__ void
3981
4017
  load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3982
4018
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3983
4019
 
3984
- #elif __CUDA_ARCH__ >= CC_TURING
4020
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3985
4021
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
3986
4022
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3987
4023
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -4001,7 +4037,7 @@ template <bool need_check> static __global__ void
4001
4037
  #else
4002
4038
  (void) vec_dot_q4_K_q8_1_mul_mat;
4003
4039
  assert(false);
4004
- #endif // __CUDA_ARCH__ >= CC_TURING
4040
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4005
4041
  }
4006
4042
 
4007
4043
  #define MMQ_X_Q5_K_RDNA2 64
@@ -4042,7 +4078,7 @@ mul_mat_q5_K(
4042
4078
  load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4043
4079
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4044
4080
 
4045
- #elif __CUDA_ARCH__ >= CC_TURING
4081
+ #elif __CUDA_ARCH__ >= CC_VOLTA
4046
4082
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
4047
4083
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
4048
4084
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4062,7 +4098,7 @@ mul_mat_q5_K(
4062
4098
  #else
4063
4099
  (void) vec_dot_q5_K_q8_1_mul_mat;
4064
4100
  assert(false);
4065
- #endif // __CUDA_ARCH__ >= CC_TURING
4101
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4066
4102
  }
4067
4103
 
4068
4104
  #define MMQ_X_Q6_K_RDNA2 64
@@ -4083,9 +4119,9 @@ template <bool need_check> static __global__ void
4083
4119
  #if defined(RDNA3) || defined(RDNA2)
4084
4120
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
4085
4121
  #endif // defined(RDNA3) || defined(RDNA2)
4086
- #elif __CUDA_ARCH__ < CC_TURING
4122
+ #elif __CUDA_ARCH__ < CC_VOLTA
4087
4123
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
4088
- #endif // __CUDA_ARCH__ < CC_TURING
4124
+ #endif // __CUDA_ARCH__ < CC_VOLTA
4089
4125
  mul_mat_q6_K(
4090
4126
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
4091
4127
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4105,7 +4141,7 @@ template <bool need_check> static __global__ void
4105
4141
  load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4106
4142
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4107
4143
 
4108
- #elif __CUDA_ARCH__ >= CC_TURING
4144
+ #elif __CUDA_ARCH__ >= CC_VOLTA
4109
4145
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
4110
4146
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
4111
4147
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4125,7 +4161,7 @@ template <bool need_check> static __global__ void
4125
4161
  #else
4126
4162
  (void) vec_dot_q6_K_q8_1_mul_mat;
4127
4163
  assert(false);
4128
- #endif // __CUDA_ARCH__ >= CC_TURING
4164
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4129
4165
  }
4130
4166
 
4131
4167
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -4550,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
4550
4586
  dst[i] = scale * x[i];
4551
4587
  }
4552
4588
 
4589
+ static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
4590
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
4591
+
4592
+ if (i >= k) {
4593
+ return;
4594
+ }
4595
+
4596
+ dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
4597
+ }
4598
+
4599
+ template<int qk, int qr, dequantize_kernel_t dq>
4600
+ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
4601
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
4602
+ const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
4603
+ const dim3 block_nums(block_num_x, nrows, 1);
4604
+ k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
4605
+ }
4606
+
4553
4607
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4554
4608
  const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4555
4609
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
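
clamp_f32 and get_rows_cuda are new in this hunk. The launcher sizes its grid so that each thread covers two columns, block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE), with one grid row per output row. For reference, here is the clamp kernel from the hunk wrapped in a small hypothetical host harness so it can be run on its own (buffer contents and sizes are illustrative):

    // clamp_f32_demo.cu -- the element-wise clamp pattern added above
    #include <cstdio>
    #include <cuda_runtime.h>

    #define CUDA_CLAMP_BLOCK_SIZE 256

    __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
    }

    int main() {
        const int k = 5;
        const float h_x[k] = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
        float h_dst[k];

        float *d_x, *d_dst;
        cudaMalloc(&d_x,   k*sizeof(float));
        cudaMalloc(&d_dst, k*sizeof(float));
        cudaMemcpy(d_x, h_x, k*sizeof(float), cudaMemcpyHostToDevice);

        const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
        clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE>>>(d_x, d_dst, -1.0f, 1.0f, k);

        cudaMemcpy(h_dst, d_dst, k*sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < k; ++i) {
            printf("%.1f -> %.1f\n", h_x[i], h_dst[i]);          // -2 and 2 are clamped to -1 and 1
        }
        cudaFree(d_x); cudaFree(d_dst);
        return 0;
    }
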
@@ -4604,32 +4658,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
4604
4658
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
4605
4659
  }
4606
4660
 
4607
- static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4661
+ template<typename dst_t>
4662
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4608
4663
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4609
4664
  dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4610
4665
  }
4611
4666
 
4612
- static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4667
+ template<typename dst_t>
4668
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4613
4669
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4614
4670
  dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4615
4671
  }
4616
4672
 
4617
- static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4673
+ template<typename dst_t>
4674
+ static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4618
4675
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4619
4676
  dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4620
4677
  }
4621
4678
 
4622
- static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4679
+ template<typename dst_t>
4680
+ static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4623
4681
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4624
4682
  dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4625
4683
  }
4626
4684
 
4627
- static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4685
+ template<typename dst_t>
4686
+ static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4628
4687
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4629
4688
  dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4630
4689
  }
4631
4690
 
4632
- static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4691
+ template<typename dst_t>
4692
+ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4633
4693
  const int nb = k / QK_K;
4634
4694
  #if QK_K == 256
4635
4695
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4638,7 +4698,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
4638
4698
  #endif
4639
4699
  }
4640
4700
 
4641
- static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4701
+ template<typename dst_t>
4702
+ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4642
4703
  const int nb = k / QK_K;
4643
4704
  #if QK_K == 256
4644
4705
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4647,12 +4708,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
4647
4708
  #endif
4648
4709
  }
4649
4710
 
4650
- static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4711
+ template<typename dst_t>
4712
+ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4651
4713
  const int nb = k / QK_K;
4652
4714
  dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
4653
4715
  }
4654
4716
 
4655
- static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4717
+ template<typename dst_t>
4718
+ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4656
4719
  const int nb = k / QK_K;
4657
4720
  #if QK_K == 256
4658
4721
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4661,7 +4724,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu
4661
4724
  #endif
4662
4725
  }
4663
4726
 
4664
- static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4727
+ template<typename dst_t>
4728
+ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4665
4729
  const int nb = k / QK_K;
4666
4730
  #if QK_K == 256
4667
4731
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4868,6 +4932,26 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
4868
4932
 
4869
4933
  static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
4870
4934
  switch (type) {
4935
+ case GGML_TYPE_Q4_0:
4936
+ return dequantize_row_q4_0_cuda;
4937
+ case GGML_TYPE_Q4_1:
4938
+ return dequantize_row_q4_1_cuda;
4939
+ case GGML_TYPE_Q5_0:
4940
+ return dequantize_row_q5_0_cuda;
4941
+ case GGML_TYPE_Q5_1:
4942
+ return dequantize_row_q5_1_cuda;
4943
+ case GGML_TYPE_Q8_0:
4944
+ return dequantize_row_q8_0_cuda;
4945
+ case GGML_TYPE_Q2_K:
4946
+ return dequantize_row_q2_K_cuda;
4947
+ case GGML_TYPE_Q3_K:
4948
+ return dequantize_row_q3_K_cuda;
4949
+ case GGML_TYPE_Q4_K:
4950
+ return dequantize_row_q4_K_cuda;
4951
+ case GGML_TYPE_Q5_K:
4952
+ return dequantize_row_q5_K_cuda;
4953
+ case GGML_TYPE_Q6_K:
4954
+ return dequantize_row_q6_K_cuda;
4871
4955
  case GGML_TYPE_F32:
4872
4956
  return convert_fp32_to_fp16_cuda;
4873
4957
  default:
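
This switch extension is enabled by the earlier hunks that template the dequantize_row_*_cuda helpers on dst_t: the same dequantization code can now be instantiated with a half destination and handed back as a to_fp16_cuda_t, which feeds the fp16 cuBLAS path further down. A compact, hedged sketch of that pattern in isolation; float and double stand in for the float/half pair so it builds without CUDA headers, and all names here are mine:

    // dst_t_dispatch_sketch.cpp -- template a converter on its destination type and
    // select one instantiation through a function-pointer getter.
    #include <cstdio>
    #include <cstdint>

    struct block_demo { float d; uint8_t qs[4]; };       // toy "quantized" block: scale + 4 codes

    template<typename dst_t>
    static void dequantize_row_demo(const void * vx, dst_t * y, const int k) {
        const block_demo * x = (const block_demo *) vx;
        for (int i = 0; i < k; ++i) {
            y[i] = (dst_t)(x[i/4].d * x[i/4].qs[i%4]);   // same math for every destination type
        }
    }

    typedef void (*to_wide_t)(const void * vx, double * y, const int k);

    static to_wide_t get_to_wide(bool quantized) {
        if (quantized) {
            return dequantize_row_demo<double>;          // the dst_t = double instantiation
        }
        return nullptr;
    }

    int main() {
        const block_demo blk = {0.5f, {1, 2, 3, 4}};
        double out[4];
        to_wide_t fn = get_to_wide(true);
        fn(&blk, out, 4);
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   // 0.5 1 1.5 2
        return 0;
    }
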
@@ -4921,7 +5005,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
4921
5005
  mmq_x = MMQ_X_Q4_0_RDNA1;
4922
5006
  mmq_y = MMQ_Y_Q4_0_RDNA1;
4923
5007
  nwarps = NWARPS_Q4_0_RDNA1;
4924
- } else if (compute_capability >= CC_TURING) {
5008
+ } else if (compute_capability >= CC_VOLTA) {
4925
5009
  mmq_x = MMQ_X_Q4_0_AMPERE;
4926
5010
  mmq_y = MMQ_Y_Q4_0_AMPERE;
4927
5011
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4966,7 +5050,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
4966
5050
  mmq_x = MMQ_X_Q4_1_RDNA1;
4967
5051
  mmq_y = MMQ_Y_Q4_1_RDNA1;
4968
5052
  nwarps = NWARPS_Q4_1_RDNA1;
4969
- } else if (compute_capability >= CC_TURING) {
5053
+ } else if (compute_capability >= CC_VOLTA) {
4970
5054
  mmq_x = MMQ_X_Q4_1_AMPERE;
4971
5055
  mmq_y = MMQ_Y_Q4_1_AMPERE;
4972
5056
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -5011,7 +5095,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
5011
5095
  mmq_x = MMQ_X_Q5_0_RDNA1;
5012
5096
  mmq_y = MMQ_Y_Q5_0_RDNA1;
5013
5097
  nwarps = NWARPS_Q5_0_RDNA1;
5014
- } else if (compute_capability >= CC_TURING) {
5098
+ } else if (compute_capability >= CC_VOLTA) {
5015
5099
  mmq_x = MMQ_X_Q5_0_AMPERE;
5016
5100
  mmq_y = MMQ_Y_Q5_0_AMPERE;
5017
5101
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -5056,7 +5140,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
5056
5140
  mmq_x = MMQ_X_Q5_1_RDNA1;
5057
5141
  mmq_y = MMQ_Y_Q5_1_RDNA1;
5058
5142
  nwarps = NWARPS_Q5_1_RDNA1;
5059
- } else if (compute_capability >= CC_TURING) {
5143
+ } else if (compute_capability >= CC_VOLTA) {
5060
5144
  mmq_x = MMQ_X_Q5_1_AMPERE;
5061
5145
  mmq_y = MMQ_Y_Q5_1_AMPERE;
5062
5146
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -5101,7 +5185,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
5101
5185
  mmq_x = MMQ_X_Q8_0_RDNA1;
5102
5186
  mmq_y = MMQ_Y_Q8_0_RDNA1;
5103
5187
  nwarps = NWARPS_Q8_0_RDNA1;
5104
- } else if (compute_capability >= CC_TURING) {
5188
+ } else if (compute_capability >= CC_VOLTA) {
5105
5189
  mmq_x = MMQ_X_Q8_0_AMPERE;
5106
5190
  mmq_y = MMQ_Y_Q8_0_AMPERE;
5107
5191
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -5146,7 +5230,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
5146
5230
  mmq_x = MMQ_X_Q2_K_RDNA1;
5147
5231
  mmq_y = MMQ_Y_Q2_K_RDNA1;
5148
5232
  nwarps = NWARPS_Q2_K_RDNA1;
5149
- } else if (compute_capability >= CC_TURING) {
5233
+ } else if (compute_capability >= CC_VOLTA) {
5150
5234
  mmq_x = MMQ_X_Q2_K_AMPERE;
5151
5235
  mmq_y = MMQ_Y_Q2_K_AMPERE;
5152
5236
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -5193,7 +5277,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
5193
5277
  mmq_x = MMQ_X_Q3_K_RDNA1;
5194
5278
  mmq_y = MMQ_Y_Q3_K_RDNA1;
5195
5279
  nwarps = NWARPS_Q3_K_RDNA1;
5196
- } else if (compute_capability >= CC_TURING) {
5280
+ } else if (compute_capability >= CC_VOLTA) {
5197
5281
  mmq_x = MMQ_X_Q3_K_AMPERE;
5198
5282
  mmq_y = MMQ_Y_Q3_K_AMPERE;
5199
5283
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -5239,7 +5323,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
5239
5323
  mmq_x = MMQ_X_Q4_K_RDNA1;
5240
5324
  mmq_y = MMQ_Y_Q4_K_RDNA1;
5241
5325
  nwarps = NWARPS_Q4_K_RDNA1;
5242
- } else if (compute_capability >= CC_TURING) {
5326
+ } else if (compute_capability >= CC_VOLTA) {
5243
5327
  mmq_x = MMQ_X_Q4_K_AMPERE;
5244
5328
  mmq_y = MMQ_Y_Q4_K_AMPERE;
5245
5329
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -5284,7 +5368,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
5284
5368
  mmq_x = MMQ_X_Q5_K_RDNA1;
5285
5369
  mmq_y = MMQ_Y_Q5_K_RDNA1;
5286
5370
  nwarps = NWARPS_Q5_K_RDNA1;
5287
- } else if (compute_capability >= CC_TURING) {
5371
+ } else if (compute_capability >= CC_VOLTA) {
5288
5372
  mmq_x = MMQ_X_Q5_K_AMPERE;
5289
5373
  mmq_y = MMQ_Y_Q5_K_AMPERE;
5290
5374
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -5329,7 +5413,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
5329
5413
  mmq_x = MMQ_X_Q6_K_RDNA1;
5330
5414
  mmq_y = MMQ_Y_Q6_K_RDNA1;
5331
5415
  nwarps = NWARPS_Q6_K_RDNA1;
5332
- } else if (compute_capability >= CC_TURING) {
5416
+ } else if (compute_capability >= CC_VOLTA) {
5333
5417
  mmq_x = MMQ_X_Q6_K_AMPERE;
5334
5418
  mmq_y = MMQ_Y_Q6_K_AMPERE;
5335
5419
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -5401,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
5401
5485
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5402
5486
  }
5403
5487
 
5488
+ static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
5489
+ const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
5490
+ clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
5491
+ }
5492
+
5404
5493
  template<typename T>
5405
5494
  static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5406
5495
  const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5668,7 +5757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5668
5757
  } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5669
5758
  GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5670
5759
  kind = cudaMemcpyDeviceToDevice;
5671
- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5760
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5672
5761
  int id;
5673
5762
  CUDA_CHECK(cudaGetDevice(&id));
5674
5763
  src_ptr = (char *) extra->data_device[id];
@@ -5704,6 +5793,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5704
5793
  }
5705
5794
  }
5706
5795
 
5796
+ static void ggml_cuda_op_repeat(
5797
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5798
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5799
+ // guaranteed to be an integer due to the check in ggml_can_repeat
5800
+ const int64_t ne0 = dst->ne[0];
5801
+ const int64_t ne1 = dst->ne[1];
5802
+ const int64_t ne2 = dst->ne[2];
5803
+ const int64_t ne3 = dst->ne[3];
5804
+
5805
+ const int64_t ne00 = src0->ne[0];
5806
+ const int64_t ne01 = src0->ne[1];
5807
+ const int64_t ne02 = src0->ne[2];
5808
+ const int64_t ne03 = src0->ne[3];
5809
+
5810
+ const size_t nb0 = dst->nb[0];
5811
+ const size_t nb1 = dst->nb[1];
5812
+ const size_t nb2 = dst->nb[2];
5813
+ const size_t nb3 = dst->nb[3];
5814
+
5815
+ const size_t nb00 = src0->nb[0];
5816
+ const size_t nb01 = src0->nb[1];
5817
+ const size_t nb02 = src0->nb[2];
5818
+ const size_t nb03 = src0->nb[3];
5819
+
5820
+ const int nr0 = (int)(ne0/ne00);
5821
+ const int nr1 = (int)(ne1/ne01);
5822
+ const int nr2 = (int)(ne2/ne02);
5823
+ const int nr3 = (int)(ne3/ne03);
5824
+
5825
+ // TODO: support for transposed / permuted tensors
5826
+ GGML_ASSERT(nb0 == sizeof(float));
5827
+ GGML_ASSERT(nb00 == sizeof(float));
5828
+
5829
+ // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
5830
+ for (int i3 = 0; i3 < nr3; i3++) {
5831
+ for (int k3 = 0; k3 < ne03; k3++) {
5832
+ for (int i2 = 0; i2 < nr2; i2++) {
5833
+ for (int k2 = 0; k2 < ne02; k2++) {
5834
+ for (int i1 = 0; i1 < nr1; i1++) {
5835
+ for (int k1 = 0; k1 < ne01; k1++) {
5836
+ for (int i0 = 0; i0 < nr0; i0++) {
5837
+ CUDA_CHECK(cudaMemcpyAsync(
5838
+ (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
5839
+ (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
5840
+ ne00*nb0, cudaMemcpyDeviceToDevice, stream));
5841
+ }
5842
+ }
5843
+ }
5844
+ }
5845
+ }
5846
+ }
5847
+ }
5848
+
5849
+ (void) src1;
5850
+ (void) src1_d;
5851
+ }
5852
+
5853
+ static void ggml_cuda_op_get_rows(
5854
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5855
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5856
+
5857
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
5858
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
5859
+ GGML_ASSERT(ggml_is_contiguous(src0));
5860
+ GGML_ASSERT(ggml_is_contiguous(src1));
5861
+ GGML_ASSERT(ggml_is_contiguous(dst));
5862
+
5863
+ const int ncols = src0->ne[0];
5864
+ const int nrows = ggml_nelements(src1);
5865
+
5866
+ const int32_t * src1_i32 = (const int32_t *) src1_d;
5867
+
5868
+ switch (src0->type) {
5869
+ case GGML_TYPE_F16:
5870
+ get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5871
+ break;
5872
+ case GGML_TYPE_F32:
5873
+ get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5874
+ break;
5875
+ case GGML_TYPE_Q4_0:
5876
+ get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5877
+ break;
5878
+ case GGML_TYPE_Q4_1:
5879
+ get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5880
+ break;
5881
+ case GGML_TYPE_Q5_0:
5882
+ get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5883
+ break;
5884
+ case GGML_TYPE_Q5_1:
5885
+ get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5886
+ break;
5887
+ case GGML_TYPE_Q8_0:
5888
+ get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5889
+ break;
5890
+ default:
5891
+ // TODO: k-quants
5892
+ GGML_ASSERT(false);
5893
+ break;
5894
+ }
5895
+ }
5896
+
5707
5897
  inline void ggml_cuda_op_add(
5708
5898
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5709
5899
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
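
ggml_cuda_op_repeat above implements GGML_OP_REPEAT by tiling the source with one cudaMemcpyAsync per contiguous row; the TODO in the code notes this is simple rather than fast. ggml_cuda_op_get_rows dispatches per source type to the templated get_rows_cuda launcher, with k-quants still left as a TODO. A minimal, self-contained illustration of the repeat-by-memcpy idea for a single row repeated three times (names and sizes are illustrative):

    // repeat_by_memcpy_demo.cu -- tile one source row into dst with per-row async copies
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        const int ne00 = 4;                      // source row length
        const int nr   = 3;                      // repeat count
        const float h_src[ne00] = {1.0f, 2.0f, 3.0f, 4.0f};

        float *d_src, *d_dst;
        cudaMalloc(&d_src, ne00*sizeof(float));
        cudaMalloc(&d_dst, nr*ne00*sizeof(float));
        cudaMemcpy(d_src, h_src, ne00*sizeof(float), cudaMemcpyHostToDevice);

        cudaStream_t stream;
        cudaStreamCreate(&stream);

        // one device-to-device copy per repetition, as in the loop nest above
        for (int i0 = 0; i0 < nr; i0++) {
            cudaMemcpyAsync((char *) d_dst + i0*ne00*sizeof(float), d_src,
                            ne00*sizeof(float), cudaMemcpyDeviceToDevice, stream);
        }
        cudaStreamSynchronize(stream);

        float h_dst[nr*ne00];
        cudaMemcpy(h_dst, d_dst, sizeof(h_dst), cudaMemcpyDeviceToHost);
        for (int i = 0; i < nr*ne00; ++i) printf("%.0f ", h_dst[i]);   // 1 2 3 4 1 2 3 4 1 2 3 4
        printf("\n");

        cudaStreamDestroy(stream);
        cudaFree(d_src); cudaFree(d_dst);
        return 0;
    }
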
@@ -5907,7 +6097,7 @@ static int64_t get_row_rounding(ggml_type type) {
5907
6097
  switch(type) {
5908
6098
  case GGML_TYPE_Q4_0:
5909
6099
  case GGML_TYPE_Q4_1:
5910
- return max_compute_capability >= CC_TURING ? 128 : 64;
6100
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
5911
6101
  case GGML_TYPE_Q5_0:
5912
6102
  case GGML_TYPE_Q5_1:
5913
6103
  case GGML_TYPE_Q8_0:
@@ -5918,7 +6108,7 @@ static int64_t get_row_rounding(ggml_type type) {
5918
6108
  case GGML_TYPE_Q3_K:
5919
6109
  case GGML_TYPE_Q4_K:
5920
6110
  case GGML_TYPE_Q5_K:
5921
- return max_compute_capability >= CC_TURING ? 128 : 64;
6111
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
5922
6112
  case GGML_TYPE_Q6_K:
5923
6113
  return 64;
5924
6114
  default:
@@ -6083,8 +6273,19 @@ inline void ggml_cuda_op_mul_mat_cublas(
6083
6273
 
6084
6274
  const int compute_capability = g_compute_capabilities[id];
6085
6275
 
6086
- if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
6087
- // convert src1 to fp16, multiply as fp16, convert dst to fp32
6276
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
6277
+ // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
6278
+ half * src0_as_f16 = nullptr;
6279
+ size_t src0_as = 0;
6280
+ if (src0->type != GGML_TYPE_F16) {
6281
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
6282
+ GGML_ASSERT(to_fp16_cuda != nullptr);
6283
+ size_t ne = row_diff*ne00;
6284
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
6285
+ to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
6286
+ }
6287
+ const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
6288
+
6088
6289
  half * src1_as_f16 = nullptr;
6089
6290
  size_t src1_as = 0;
6090
6291
  if (src1->type != GGML_TYPE_F16) {
@@ -6106,9 +6307,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
6106
6307
  CUBLAS_CHECK(
6107
6308
  cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6108
6309
  row_diff, src1_ncols, ne10,
6109
- &alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
6110
- src1_ptr, CUDA_R_16F, ne10,
6111
- &beta_f16, dst_f16, CUDA_R_16F, ldc,
6310
+ &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
6311
+ src1_ptr, CUDA_R_16F, ne10,
6312
+ &beta_f16, dst_f16, CUDA_R_16F, ldc,
6112
6313
  CUBLAS_COMPUTE_16F,
6113
6314
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
6114
6315
 
@@ -6117,6 +6318,10 @@ inline void ggml_cuda_op_mul_mat_cublas(
6117
6318
 
6118
6319
  ggml_cuda_pool_free(dst_f16, dst_as);
6119
6320
 
6321
+ if (src0_as != 0) {
6322
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
6323
+ }
6324
+
6120
6325
  if (src1_as != 0) {
6121
6326
  ggml_cuda_pool_free(src1_as_f16, src1_as);
6122
6327
  }
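
The three hunks above change the cuBLAS path: when the compute capability is at least CC_VOLTA, a quantized src0 is first converted to fp16 through the helpers registered in ggml_get_to_fp16_cuda, the product is computed with cublasGemmEx in half precision, and the temporary pool buffer is freed afterwards. Below is a hedged, self-contained sketch of the fp16 GemmEx call pattern only, reduced to an 8x8 product with no transposes and no pool allocator; every name and size in it is mine, and error checking is omitted for brevity:

    // gemm_fp16_sketch.cu -- minimal cublasGemmEx call with half storage and CUBLAS_COMPUTE_16F
    // build: nvcc gemm_fp16_sketch.cu -lcublas
    #include <cstdio>
    #include <cuda_fp16.h>
    #include <cublas_v2.h>

    int main() {
        const int n = 8;                               // column-major n x n matrices
        float a_f32[n*n], b_f32[n*n] = {0};
        for (int i = 0; i < n*n; ++i) a_f32[i] = (float)(i % 7);
        for (int i = 0; i < n;   ++i) b_f32[i*n + i] = 1.0f;     // B = identity, so C should equal A

        half h_a[n*n], h_b[n*n];
        for (int i = 0; i < n*n; ++i) { h_a[i] = __float2half(a_f32[i]); h_b[i] = __float2half(b_f32[i]); }

        half *d_a, *d_b, *d_c;
        cudaMalloc(&d_a, n*n*sizeof(half));
        cudaMalloc(&d_b, n*n*sizeof(half));
        cudaMalloc(&d_c, n*n*sizeof(half));
        cudaMemcpy(d_a, h_a, n*n*sizeof(half), cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, h_b, n*n*sizeof(half), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);

        const half alpha = __float2half(1.0f);         // with CUBLAS_COMPUTE_16F, alpha/beta are half
        const half beta  = __float2half(0.0f);
        cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                     n, n, n,
                     &alpha, d_a, CUDA_R_16F, n,
                             d_b, CUDA_R_16F, n,
                     &beta,  d_c, CUDA_R_16F, n,
                     CUBLAS_COMPUTE_16F,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP);

        half h_c[n*n];
        cudaMemcpy(h_c, d_c, n*n*sizeof(half), cudaMemcpyDeviceToHost);
        for (int i = 0; i < 4; ++i) printf("%.0f ", __half2float(h_c[i]));   // 0 1 2 3 (start of A's first column)
        printf("\n");

        cublasDestroy(handle);
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
        return 0;
    }
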
@@ -6229,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
6229
6434
  const int64_t ne02 = src0->ne[2];
6230
6435
  const int64_t nrows = ggml_nrows(src0);
6231
6436
 
6232
- const int n_past = ((int32_t *) dst->op_params)[0];
6437
+ //const int n_past = ((int32_t *) dst->op_params)[0];
6233
6438
  const int n_head = ((int32_t *) dst->op_params)[1];
6234
6439
  float max_bias;
6235
6440
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
6236
6441
 
6237
- GGML_ASSERT(ne01 + n_past == ne00);
6442
+ //GGML_ASSERT(ne01 + n_past == ne00);
6238
6443
  GGML_ASSERT(n_head == ne02);
6239
6444
 
6240
6445
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6293,7 +6498,14 @@ inline void ggml_cuda_op_scale(
6293
6498
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6294
6499
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6295
6500
 
6296
- const float scale = ((float *) src1->data)[0];
6501
+ float scale;
6502
+ // HACK: support for ggml backend interface
6503
+ if (src1->backend == GGML_BACKEND_CPU) {
6504
+ scale = ((float *) src1->data)[0];
6505
+ } else {
6506
+ // TODO: pass pointer to kernel instead of copying to host
6507
+ CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
6508
+ }
6297
6509
 
6298
6510
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
6299
6511
  CUDA_CHECK(cudaGetLastError());
@@ -6303,6 +6515,24 @@ inline void ggml_cuda_op_scale(
6303
6515
  (void) src1_dd;
6304
6516
  }
6305
6517
 
6518
+ inline void ggml_cuda_op_clamp(
6519
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6520
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6521
+
6522
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6523
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6524
+
6525
+ const float min = ((float *) dst->op_params)[0];
6526
+ const float max = ((float *) dst->op_params)[1];
6527
+
6528
+ clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
6529
+ CUDA_CHECK(cudaGetLastError());
6530
+
6531
+ (void) src1;
6532
+ (void) dst;
6533
+ (void) src1_dd;
6534
+ }
6535
+
6306
6536
  static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6307
6537
  const int64_t nrows0 = ggml_nrows(src0);
6308
6538
 
@@ -6312,9 +6542,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
6312
6542
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6313
6543
  GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6314
6544
 
6315
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6316
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6317
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6545
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6546
+ ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6547
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6318
6548
 
6319
6549
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6320
6550
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6455,9 +6685,9 @@ static void ggml_cuda_op_mul_mat(
6455
6685
  const size_t q8_1_ts = sizeof(block_q8_1);
6456
6686
  const size_t q8_1_bs = QK8_1;
6457
6687
 
6458
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6459
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6460
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6688
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6689
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6690
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6461
6691
 
6462
6692
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6463
6693
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6535,7 +6765,7 @@ static void ggml_cuda_op_mul_mat(
6535
6765
  if (convert_src1_to_q8_1) {
6536
6766
  src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6537
6767
 
6538
- if (split && src1_on_device && src1_is_contiguous) {
6768
+ if (src1_on_device && src1_is_contiguous) {
6539
6769
  quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6540
6770
  CUDA_CHECK(cudaGetLastError());
6541
6771
  }
@@ -6617,7 +6847,7 @@ static void ggml_cuda_op_mul_mat(
6617
6847
  GGML_ASSERT(false);
6618
6848
  }
6619
6849
 
6620
- if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6850
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
6621
6851
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6622
6852
  CUDA_CHECK(cudaGetLastError());
6623
6853
  }
@@ -6708,6 +6938,14 @@ static void ggml_cuda_op_mul_mat(
6708
6938
  }
6709
6939
  }
6710
6940
 
6941
+ static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6942
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
6943
+ }
6944
+
6945
+ static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6946
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
6947
+ }
6948
+
6711
6949
  static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6712
6950
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6713
6951
  }
@@ -6762,13 +7000,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
6762
7000
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6763
7001
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6764
7002
 
6765
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7003
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6766
7004
  void * src0_ddq = src0_extra->data_device[g_main_device];
6767
7005
 
6768
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7006
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6769
7007
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6770
7008
 
6771
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7009
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6772
7010
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6773
7011
 
6774
7012
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -6793,13 +7031,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
6793
7031
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6794
7032
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6795
7033
 
6796
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7034
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6797
7035
  void * src0_ddq = src0_extra->data_device[g_main_device];
6798
7036
 
6799
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7037
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6800
7038
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6801
7039
 
6802
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7040
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6803
7041
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6804
7042
 
6805
7043
  const int64_t row_stride_x = nb01 / sizeof(half);
@@ -6820,11 +7058,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
6820
7058
  }
6821
7059
  }
6822
7060
 
6823
- if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7061
+ if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6824
7062
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6825
7063
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6826
7064
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
6827
- }else if (src0->type == GGML_TYPE_F32) {
7065
+ } else if (src0->type == GGML_TYPE_F32) {
6828
7066
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6829
7067
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
6830
7068
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -6856,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
6856
7094
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6857
7095
  }
6858
7096
 
7097
+ static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7098
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
7099
+ }
7100
+
6859
7101
  static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6860
7102
  const int64_t ne = ggml_nelements(src0);
6861
7103
  GGML_ASSERT(ne == ggml_nelements(src1));
@@ -6885,8 +7127,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
6885
7127
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6886
7128
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6887
7129
 
6888
- const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6889
- const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7130
+ const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7131
+ const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6890
7132
 
6891
7133
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
6892
7134
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -6941,8 +7183,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6941
7183
 
6942
7184
  const size_t nb1 = tensor->nb[1];
6943
7185
 
6944
- ggml_backend backend = tensor->backend;
6945
- struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
7186
+ ggml_backend_type backend = tensor->backend;
7187
+ ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
6946
7188
  memset(extra, 0, sizeof(*extra));
6947
7189
 
6948
7190
  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -6996,7 +7238,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6996
7238
  CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
6997
7239
  }
6998
7240
 
6999
-
7000
7241
  CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
7001
7242
 
7002
7243
  extra->data_device[id] = buf;
@@ -7035,17 +7276,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
7035
7276
  delete extra;
7036
7277
  }
7037
7278
 
7038
- static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7279
+ static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7039
7280
  static size_t g_temp_tensor_extra_index = 0;
7040
7281
 
7041
- static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7282
+ static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7042
7283
  if (g_temp_tensor_extras == nullptr) {
7043
7284
  g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7044
7285
  }
7045
7286
 
7046
7287
  size_t alloc_index = g_temp_tensor_extra_index;
7047
7288
  g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7048
- struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7289
+ ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7049
7290
  memset(extra, 0, sizeof(*extra));
7050
7291
 
7051
7292
  return extra;
@@ -7073,7 +7314,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7073
7314
  return;
7074
7315
  }
7075
7316
 
7076
- struct ggml_tensor_extra_gpu * extra;
7317
+ ggml_tensor_extra_gpu * extra;
7077
7318
 
7078
7319
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7079
7320
  tensor->op == GGML_OP_VIEW ||
@@ -7082,7 +7323,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7082
7323
 
7083
7324
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7084
7325
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7085
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7326
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7086
7327
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7087
7328
  size_t offset = 0;
7088
7329
  if (tensor->op == GGML_OP_VIEW) {
@@ -7091,7 +7332,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7091
7332
  extra = ggml_cuda_alloc_temp_tensor_extra();
7092
7333
  extra->data_device[g_main_device] = src0_ddc + offset;
7093
7334
  } else if (tensor->op == GGML_OP_CPY) {
7094
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7335
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7095
7336
  void * src1_ddv = src1_extra->data_device[g_main_device];
7096
7337
  extra = ggml_cuda_alloc_temp_tensor_extra();
7097
7338
  extra->data_device[g_main_device] = src1_ddv;
@@ -7133,13 +7374,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
7133
7374
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
7134
7375
  }
7135
7376
 
7136
- struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7377
+ ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7137
7378
 
7138
7379
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7139
7380
  tensor->op == GGML_OP_VIEW;
7140
7381
 
7141
7382
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7142
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7383
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7143
7384
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7144
7385
  size_t view_offset = 0;
7145
7386
  if (tensor->op == GGML_OP_VIEW) {
@@ -7157,7 +7398,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
7157
7398
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7158
7399
  GGML_ASSERT(ggml_is_contiguous(tensor));
7159
7400
 
7160
- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7401
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7161
7402
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7162
7403
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7163
7404
  }
@@ -7214,58 +7455,47 @@ void ggml_cuda_free_scratch() {
  g_scratch_buffer = nullptr;
  }

- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+ return false;
+ }
+
  switch (tensor->op) {
+ case GGML_OP_REPEAT:
+ func = ggml_cuda_repeat;
+ break;
+ case GGML_OP_GET_ROWS:
+ func = ggml_cuda_get_rows;
+ break;
  case GGML_OP_DUP:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_dup;
  break;
  case GGML_OP_ADD:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_add;
  break;
  case GGML_OP_MUL:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_mul;
  break;
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(tensor)) {
  case GGML_UNARY_OP_GELU:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_gelu;
  break;
  case GGML_UNARY_OP_SILU:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_silu;
  break;
  default:
  return false;
  } break;
  case GGML_OP_NORM:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_norm;
  break;
  case GGML_OP_RMS_NORM:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_rms_norm;
  break;
  case GGML_OP_MUL_MAT:
@@ -7275,54 +7505,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  func = ggml_cuda_mul_mat;
  break;
  case GGML_OP_SCALE:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_scale;
  break;
- case GGML_OP_CPY:
+ case GGML_OP_CLAMP:
  if (!any_on_device) {
  return false;
  }
+ func = ggml_cuda_clamp;
+ break;
+ case GGML_OP_CPY:
  func = ggml_cuda_cpy;
  break;
  case GGML_OP_CONT:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_dup;
  break;
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE:
  case GGML_OP_TRANSPOSE:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_nop;
  break;
  case GGML_OP_DIAG_MASK_INF:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_diag_mask_inf;
  break;
  case GGML_OP_SOFT_MAX:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_soft_max;
  break;
  case GGML_OP_ROPE:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_rope;
  break;
  case GGML_OP_ALIBI:
- if (!any_on_device) {
- return false;
- }
  func = ggml_cuda_alibi;
  break;
  default:
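
The two hunks above restructure ggml_cuda_compute_forward: the per-case "if (!any_on_device) return false;" guards are replaced by a single early return before the switch (GGML_OP_MUL_MAT is exempted because a matrix multiplication may still be offloaded to the GPU even when none of its tensors are resident there), and dispatch entries are added for GGML_OP_REPEAT, GGML_OP_GET_ROWS and GGML_OP_CLAMP. A standalone sketch of the same early-exit dispatch pattern, with simplified hypothetical names rather than the library's code:

    #include <cstdio>

    // Simplified stand-in for the op dispatcher: one guard up front instead of one per case.
    enum op_t { OP_ADD, OP_MUL_MAT, OP_CLAMP };
    typedef void (*handler_t)();

    static void run_add()     { std::puts("add");     }
    static void run_mul_mat() { std::puts("mul_mat"); }
    static void run_clamp()   { std::puts("clamp");   }

    static bool dispatch(op_t op, bool any_on_device) {
        if (!any_on_device && op != OP_MUL_MAT) {
            return false;  // nothing lives on the GPU, and only mul_mat is worth offloading anyway
        }
        handler_t func = nullptr;
        switch (op) {
            case OP_ADD:     func = run_add;     break;
            case OP_MUL_MAT: func = run_mul_mat; break;
            case OP_CLAMP:   func = run_clamp;   break;
            default:         return false;
        }
        func();
        return true;
    }
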
@@ -7350,3 +7562,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
  snprintf(description, description_size, "%s", prop.name);
  }
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // backend interface
+
+ #define UNUSED GGML_UNUSED
+
+ struct ggml_backend_context_cuda {
+ };
+
+ static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+ return GGML_CUDA_NAME;
+
+ UNUSED(backend);
+ }
+
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ delete cuda_ctx;
+ delete backend;
+ }
+
+ struct ggml_backend_buffer_context_cuda {
+ void * device;
+
+ ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+ size_t temp_tensor_extra_index = 0;
+
+ ~ggml_backend_buffer_context_cuda() {
+ delete[] temp_tensor_extras;
+ }
+
+ ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (temp_tensor_extras == nullptr) {
+ temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = temp_tensor_extra_index;
+ temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+ }
+ };
+
+ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ CUDA_CHECK(cudaFree(ctx->device));
+ delete ctx;
+ }
+
+ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ return ctx->device;
+ }
+
+ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ int64_t row_low = 0;
+ int64_t row_high = ggml_nrows(tensor);
+ int64_t nrows_split = row_high - row_low;
+
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+ int64_t ne0 = tensor->ne[0];
+
+ if (ggml_is_quantized(tensor->type)) {
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+ }
+ }
+
+ return size;
+
+ UNUSED(buffer);
+ }
+
+ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+ if (tensor->view_src != NULL && tensor->view_offs == 0) {
+ assert(tensor->view_src->buffer->backend == buffer->backend);
+ tensor->backend = tensor->view_src->backend;
+ tensor->extra = tensor->view_src->extra;
+ return;
+ }
+
+ ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+ extra->data_device[g_main_device] = tensor->data;
+
+ tensor->backend = GGML_BACKEND_GPU;
+ tensor->extra = extra;
+
+ if (ggml_is_quantized(tensor->type)) {
+ // initialize padding to 0 to avoid possible NaN values
+ int64_t row_low = 0;
+ int64_t row_high = ggml_nrows(tensor);
+ int64_t nrows_split = row_high - row_low;
+
+ size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+ size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
+
+ if (padded_size > original_size && tensor->view_src == nullptr) {
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+ }
+ }
+
+ UNUSED(buffer);
+ }
+
+ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
+ /* .free_tensor = */ NULL,
+ };
+
+ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+ ggml_cuda_set_device(g_main_device);
+
+ ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+ CUDA_CHECK(cudaMalloc(&ctx->device, size));
+ return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+ }
+
+ static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+ return 128;
+ UNUSED(backend);
+ }
+
+ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+ UNUSED(backend);
+ }
+
+ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+ UNUSED(backend);
+ }
+
+ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+ UNUSED(backend);
+ }
+
+ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ GGML_ASSERT(!"not implemented");
+
+ return nullptr;
+
+ UNUSED(backend);
+ UNUSED(cgraph);
+ }
+
+ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_ASSERT(!"not implemented");
+
+ UNUSED(backend);
+ UNUSED(plan);
+ }
+
+ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_ASSERT(!"not implemented");
+
+ UNUSED(backend);
+ UNUSED(plan);
+ }
+
+ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ ggml_cuda_set_device(g_main_device);
+
+ ggml_compute_params params = {};
+ params.type = GGML_TASK_COMPUTE;
+ params.ith = 0;
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ assert(node->backend == GGML_BACKEND_GPU);
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ if (node->src[j] != nullptr) {
+ assert(node->src[j]->backend == GGML_BACKEND_GPU);
+ }
+ }
+
+ bool ok = ggml_cuda_compute_forward(&params, node);
+ if (!ok) {
+ fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ }
+ GGML_ASSERT(ok);
+
+ #if 0
+ if (node->type == GGML_TYPE_F32) {
+ cudaDeviceSynchronize();
+ std::vector<float> tmp(ggml_nelements(node), 0.0f);
+ cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+ printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+ ggml_type_name(node->src[0]->type),
+ node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+ node->src[0]->name,
+ node->src[1] ? node->src[1]->name : "none");
+ double sum = 0.0;
+ double sq_sum = 0.0;
+ for (int i = 0; i < ggml_nelements(node); i++) {
+ printf("%f ", tmp[i]);
+ sum += tmp[i];
+ sq_sum += tmp[i]*tmp[i];
+ }
+ printf("\n");
+ printf("sum: %f, ", sum);
+ printf("sq_sum: %f\n", sq_sum);
+ }
+ #endif
+ }
+
+ UNUSED(backend);
+ }
+
+ static ggml_backend_i cuda_backend_i = {
+ /* .get_name = */ ggml_backend_cuda_name,
+ /* .free = */ ggml_backend_cuda_free,
+ /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cuda_get_alignment,
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
+ /* .cpy_tensor_from = */ nullptr,
+ /* .cpy_tensor_to = */ nullptr,
+ /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
+ /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
+ /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
+ /* .supports_op = */ nullptr,
+ };
+
+ ggml_backend_t ggml_backend_cuda_init() {
+ ggml_init_cublas(); // TODO: remove from ggml.c
+
+ ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+ ggml_backend_t cuda_backend = new ggml_backend {
+ /* .interface = */ cuda_backend_i,
+ /* .context = */ ctx
+ };
+
+ return cuda_backend;
+ }
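
A note on ggml_backend_cuda_buffer_get_alloc_size and ggml_backend_cuda_buffer_init_tensor added above: for quantized tensors whose row length ne0 is not a multiple of MATRIX_ROW_PADDING, the reported allocation is rounded up, and init_tensor zero-fills the padded tail with cudaMemsetAsync so the padding cannot feed NaNs into later dot products. Below is a self-contained sketch of the same size arithmetic; the constants are example values only (32 elements per 18-byte block mimics a 4-bit quantized type, and 512 stands in for the row padding), not a statement of what the library uses:

    #include <cstdint>
    #include <cstdio>

    // Bytes needed for one quantized row of ne0 elements, rounded up to a padding boundary.
    // type_size = bytes per quantization block, block_size = elements per block.
    static size_t padded_row_size(int64_t ne0, size_t type_size, int64_t block_size, int64_t row_padding) {
        size_t size = (size_t)(ne0 / block_size) * type_size;                    // unpadded row
        if (ne0 % row_padding != 0) {
            size += (row_padding - ne0 % row_padding) * type_size / block_size;  // pad up to the boundary
        }
        return size;
    }

    int main() {
        // 4000 elements pad up to 4096: 4096/32 blocks * 18 bytes = 2304 bytes (vs. 2250 unpadded)
        std::printf("%zu\n", padded_row_size(4000, 18, 32, 512));
        return 0;
    }

In the diff itself the rounding uses MATRIX_ROW_PADDING together with ggml_type_size()/ggml_blck_size(), and the zero fill runs only when the padded size exceeds the original size and the tensor is not a view.
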