llama_cpp 0.6.0 → 0.7.1

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
@@ -62,6 +62,7 @@
62
62
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
63
63
  #define cudaMemcpyKind hipMemcpyKind
64
64
  #define cudaMemset hipMemset
65
+ #define cudaMemsetAsync hipMemsetAsync
65
66
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
66
67
  #define cudaSetDevice hipSetDevice
67
68
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
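
The new cudaMemsetAsync alias completes the CUDA-to-HIP macro table for asynchronous memset, so call sites need no #ifdef. A minimal sketch of the pattern the alias enables (the buffer and stream names are illustrative, not from the source):

    #include <cuda_runtime.h>

    // Under GGML_USE_HIPBLAS the macro table above rewrites this call to
    // hipMemsetAsync at preprocessing time; under plain CUDA it is untouched.
    static void zero_scratch_async(void * d_buf, size_t nbytes, cudaStream_t stream) {
        cudaMemsetAsync(d_buf, 0, nbytes, stream);   // non-blocking zero-fill on the given stream
    }
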
@@ -80,9 +81,9 @@
80
81
  #include "ggml.h"
81
82
 
82
83
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
83
- #define CC_TURING 700
84
+ #define CC_VOLTA 700
84
85
  #define CC_OFFSET_AMD 1000000
85
- #define CC_RDNA2 CC_OFFSET_AMD + 1030
86
+ #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
86
87
 
87
88
  #if defined(GGML_USE_HIPBLAS)
88
89
  #define __CUDA_ARCH__ 1300
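
Two things change in the hunk above: the compute-capability threshold keeps its value of 700 but is renamed from CC_TURING to CC_VOLTA, since 7.0 is Volta (Turing is 7.5); and CC_RDNA2 gains parentheses so the macro behaves like a single value inside larger expressions. A small illustration of the second point (the *_UNSAFE/*_SAFE names are hypothetical, for contrast only):

    #define CC_OFFSET_AMD   1000000
    #define CC_RDNA2_UNSAFE CC_OFFSET_AMD + 1030     // old, unparenthesized form
    #define CC_RDNA2_SAFE   (CC_OFFSET_AMD + 1030)   // new form from this release

    // 2*CC_RDNA2_UNSAFE expands to 2*1000000 + 1030 == 2001030 (precedence surprise),
    // while 2*CC_RDNA2_SAFE expands to 2*(1000000 + 1030) == 2002060 as intended.
    static_assert(2*CC_RDNA2_SAFE == 2002060, "parenthesized macro acts as one value");
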
@@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
414
415
  #define CUDA_SILU_BLOCK_SIZE 256
415
416
  #define CUDA_CPY_BLOCK_SIZE 32
416
417
  #define CUDA_SCALE_BLOCK_SIZE 256
418
+ #define CUDA_CLAMP_BLOCK_SIZE 256
417
419
  #define CUDA_ROPE_BLOCK_SIZE 256
418
420
  #define CUDA_ALIBI_BLOCK_SIZE 32
419
421
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
420
422
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
421
423
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
424
+ #define CUDA_GET_ROWS_BLOCK_SIZE 256
422
425
 
423
426
  // dmmv = dequantize_mul_mat_vec
424
427
  #ifndef GGML_CUDA_DMMV_X
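
CUDA_CLAMP_BLOCK_SIZE and CUDA_GET_ROWS_BLOCK_SIZE follow the existing per-op block-size convention: the launchers added later in this diff round the element count up to a whole number of blocks with integer ceil-division. A worked example, assuming k = 1000 elements:

    const int k          = 1000;                                 // illustrative element count
    const int block_size = 256;                                  // e.g. CUDA_CLAMP_BLOCK_SIZE
    const int num_blocks = (k + block_size - 1) / block_size;    // (1000 + 255) / 256 == 4 blocks
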
@@ -715,7 +718,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
715
718
 
716
719
  //================================== k-quants
717
720
 
718
- static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
721
+ template<typename dst_t>
722
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
719
723
 
720
724
  const int i = blockIdx.x;
721
725
  const block_q2_K * x = (const block_q2_K *) vx;
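
Templating the k-quant dequantize kernels on dst_t lets one kernel body write either fp32 or fp16 output; the fp16 instantiation is what the extended ggml_get_to_fp16_cuda dispatch later in this diff relies on. A sketch of launching both instantiations (vx, y_f32, y_f16, nb and stream are assumed to be set up by the caller):

    // vx points at nb quantized q2_K blocks; y_f32/y_f16 have room for nb*QK_K values.
    static void dequantize_q2_K_both(const void * vx, float * y_f32, half * y_f16,
                                     int nb, cudaStream_t stream) {
        dequantize_block_q2_K<float><<<nb, 64, 0, stream>>>(vx, y_f32);   // fp32 output, as before
        dequantize_block_q2_K<half> <<<nb, 64, 0, stream>>>(vx, y_f16);   // fp16 output, new
    }
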
@@ -727,7 +731,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
727
731
  const int is = 8*n + l/16;
728
732
 
729
733
  const uint8_t q = x[i].qs[32*n + l];
730
- float * y = yy + i*QK_K + 128*n;
734
+ dst_t * y = yy + i*QK_K + 128*n;
731
735
 
732
736
  float dall = __low2half(x[i].dm);
733
737
  float dmin = __high2half(x[i].dm);
@@ -739,7 +743,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
739
743
  const int is = tid/16; // 0 or 1
740
744
  const int il = tid%16; // 0...15
741
745
  const uint8_t q = x[i].qs[il] >> (2*is);
742
- float * y = yy + i*QK_K + 16*is + il;
746
+ dst_t * y = yy + i*QK_K + 16*is + il;
743
747
  float dall = __low2half(x[i].dm);
744
748
  float dmin = __high2half(x[i].dm);
745
749
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
@@ -748,7 +752,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
748
752
 
749
753
  }
750
754
 
751
- static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
755
+ template<typename dst_t>
756
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
752
757
 
753
758
  const int i = blockIdx.x;
754
759
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -772,7 +777,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
772
777
  float d_all = x[i].d;
773
778
  float dl = d_all * (us - 32);
774
779
 
775
- float * y = yy + i*QK_K + 128*n + 32*j;
780
+ dst_t * y = yy + i*QK_K + 128*n + 32*j;
776
781
  const uint8_t * q = x[i].qs + 32*n;
777
782
  const uint8_t * hm = x[i].hmask;
778
783
 
@@ -784,7 +789,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
784
789
  const int im = il/8; // 0...1
785
790
  const int in = il%8; // 0...7
786
791
 
787
- float * y = yy + i*QK_K + 16*is + il;
792
+ dst_t * y = yy + i*QK_K + 16*is + il;
788
793
 
789
794
  const uint8_t q = x[i].qs[il] >> (2*is);
790
795
  const uint8_t h = x[i].hmask[in] >> (2*is + im);
@@ -812,7 +817,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
812
817
  }
813
818
  #endif
814
819
 
815
- static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
820
+ template<typename dst_t>
821
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
816
822
  const block_q4_K * x = (const block_q4_K *) vx;
817
823
 
818
824
  const int i = blockIdx.x;
@@ -825,7 +831,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
825
831
  const int is = 2*il;
826
832
  const int n = 4;
827
833
 
828
- float * y = yy + i*QK_K + 64*il + n*ir;
834
+ dst_t * y = yy + i*QK_K + 64*il + n*ir;
829
835
 
830
836
  const float dall = __low2half(x[i].dm);
831
837
  const float dmin = __high2half(x[i].dm);
@@ -844,7 +850,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
844
850
  #else
845
851
  const int tid = threadIdx.x;
846
852
  const uint8_t * q = x[i].qs;
847
- float * y = yy + i*QK_K;
853
+ dst_t * y = yy + i*QK_K;
848
854
  const float d = (float)x[i].dm[0];
849
855
  const float m = (float)x[i].dm[1];
850
856
  y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
@@ -852,7 +858,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
852
858
  #endif
853
859
  }
854
860
 
855
- static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
861
+ template<typename dst_t>
862
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
856
863
  const block_q5_K * x = (const block_q5_K *) vx;
857
864
 
858
865
  const int i = blockIdx.x;
@@ -864,7 +871,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
864
871
  const int ir = tid%16; // ir is in 0...15
865
872
  const int is = 2*il; // is is in 0...6
866
873
 
867
- float * y = yy + i*QK_K + 64*il + 2*ir;
874
+ dst_t * y = yy + i*QK_K + 64*il + 2*ir;
868
875
 
869
876
  const float dall = __low2half(x[i].dm);
870
877
  const float dmin = __high2half(x[i].dm);
@@ -892,13 +899,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
892
899
  const int is = tid/16; // 0 or 1
893
900
  const uint8_t h = x[i].qh[in] >> im;
894
901
  const float d = x[i].d;
895
- float * y = yy + i*QK_K + tid;
902
+ dst_t * y = yy + i*QK_K + tid;
896
903
  y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
897
904
  y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
898
905
  #endif
899
906
  }
900
907
 
901
- static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
908
+ template<typename dst_t>
909
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
902
910
  const block_q6_K * x = (const block_q6_K *) vx;
903
911
 
904
912
  const int i = blockIdx.x;
@@ -910,7 +918,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
910
918
  const int il = tid - 32*ip; // 0...32
911
919
  const int is = 8*ip + il/16;
912
920
 
913
- float * y = yy + i*QK_K + 128*ip + il;
921
+ dst_t * y = yy + i*QK_K + 128*ip + il;
914
922
 
915
923
  const float d = x[i].d;
916
924
 
@@ -929,7 +937,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
929
937
  const int ip = tid/16; // 0 or 1
930
938
  const int il = tid - 16*ip; // 0...15
931
939
 
932
- float * y = yy + i*QK_K + 16*ip + il;
940
+ dst_t * y = yy + i*QK_K + 16*ip + il;
933
941
 
934
942
  const float d = x[i].d;
935
943
 
@@ -1569,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1569
1577
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1570
1578
  }
1571
1579
 
1580
+ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1581
+ static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
1582
+ const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1583
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
1584
+
1585
+ if (col >= ncols) {
1586
+ return;
1587
+ }
1588
+
1589
+ const int r = y[row];
1590
+
1591
+ // copy x[r*ncols + col] to dst[row*ncols + col]
1592
+ const int xi = r*ncols + col;
1593
+ const int di = row*ncols + col;
1594
+
1595
+ const int ib = xi/qk; // block index
1596
+ const int iqs = (xi%qk)/qr; // quant index
1597
+ const int iybs = di - di%qk; // y block start index
1598
+ const int y_offset = qr == 1 ? 1 : qk/2;
1599
+
1600
+ // dequantize
1601
+ dfloat2 v;
1602
+ dequantize_kernel(x, ib, iqs, v);
1603
+
1604
+ dst[iybs + iqs + 0] = v.x;
1605
+ dst[iybs + iqs + y_offset] = v.y;
1606
+ }
1607
+
1572
1608
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1573
1609
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1574
1610
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
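
k_get_rows is a row gather: for each entry in the int32 index tensor y it copies, and for quantized types dequantizes, the matching row of x into dst. A plain CPU reference of the same semantics for an already-dequantized float matrix (a sketch; names are illustrative):

    #include <stdint.h>

    // dst[row, :] = x[rows[row], :] for every row in [0, nrows)
    static void get_rows_ref(const float * x, const int32_t * rows, float * dst,
                             int nrows, int ncols) {
        for (int row = 0; row < nrows; ++row) {
            const int r = rows[row];
            for (int col = 0; col < ncols; ++col) {
                dst[row*ncols + col] = x[r*ncols + col];
            }
        }
    }
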
@@ -3548,7 +3584,7 @@ template <bool need_check> static __global__ void
3548
3584
  load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3549
3585
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3550
3586
 
3551
- #elif __CUDA_ARCH__ >= CC_TURING
3587
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3552
3588
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
3553
3589
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3554
3590
  const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3568,7 +3604,7 @@ template <bool need_check> static __global__ void
3568
3604
  #else
3569
3605
  (void) vec_dot_q4_0_q8_1_mul_mat;
3570
3606
  assert(false);
3571
- #endif // __CUDA_ARCH__ >= CC_TURING
3607
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3572
3608
  }
3573
3609
 
3574
3610
  #define MMQ_X_Q4_1_RDNA2 64
@@ -3589,9 +3625,9 @@ template <bool need_check> static __global__ void
3589
3625
  #if defined(RDNA3) || defined(RDNA2)
3590
3626
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
3591
3627
  #endif // defined(RDNA3) || defined(RDNA2)
3592
- #elif __CUDA_ARCH__ < CC_TURING
3628
+ #elif __CUDA_ARCH__ < CC_VOLTA
3593
3629
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3594
- #endif // __CUDA_ARCH__ < CC_TURING
3630
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3595
3631
  mul_mat_q4_1(
3596
3632
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3597
3633
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3611,7 +3647,7 @@ template <bool need_check> static __global__ void
3611
3647
  load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3612
3648
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3613
3649
 
3614
- #elif __CUDA_ARCH__ >= CC_TURING
3650
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3615
3651
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
3616
3652
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3617
3653
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3631,7 +3667,7 @@ template <bool need_check> static __global__ void
3631
3667
  #else
3632
3668
  (void) vec_dot_q4_1_q8_1_mul_mat;
3633
3669
  assert(false);
3634
- #endif // __CUDA_ARCH__ >= CC_TURING
3670
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3635
3671
  }
3636
3672
 
3637
3673
  #define MMQ_X_Q5_0_RDNA2 64
@@ -3672,7 +3708,7 @@ template <bool need_check> static __global__ void
3672
3708
  load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3673
3709
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3674
3710
 
3675
- #elif __CUDA_ARCH__ >= CC_TURING
3711
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3676
3712
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
3677
3713
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3678
3714
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3692,7 +3728,7 @@ template <bool need_check> static __global__ void
3692
3728
  #else
3693
3729
  (void) vec_dot_q5_0_q8_1_mul_mat;
3694
3730
  assert(false);
3695
- #endif // __CUDA_ARCH__ >= CC_TURING
3731
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3696
3732
  }
3697
3733
 
3698
3734
  #define MMQ_X_Q5_1_RDNA2 64
@@ -3733,7 +3769,7 @@ mul_mat_q5_1(
3733
3769
  load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3734
3770
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3735
3771
 
3736
- #elif __CUDA_ARCH__ >= CC_TURING
3772
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3737
3773
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
3738
3774
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3739
3775
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3753,7 +3789,7 @@ mul_mat_q5_1(
3753
3789
  #else
3754
3790
  (void) vec_dot_q5_1_q8_1_mul_mat;
3755
3791
  assert(false);
3756
- #endif // __CUDA_ARCH__ >= CC_TURING
3792
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3757
3793
  }
3758
3794
 
3759
3795
  #define MMQ_X_Q8_0_RDNA2 64
@@ -3794,7 +3830,7 @@ template <bool need_check> static __global__ void
3794
3830
  load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3795
3831
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3796
3832
 
3797
- #elif __CUDA_ARCH__ >= CC_TURING
3833
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3798
3834
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
3799
3835
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3800
3836
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3814,7 +3850,7 @@ template <bool need_check> static __global__ void
3814
3850
  #else
3815
3851
  (void) vec_dot_q8_0_q8_1_mul_mat;
3816
3852
  assert(false);
3817
- #endif // __CUDA_ARCH__ >= CC_TURING
3853
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3818
3854
  }
3819
3855
 
3820
3856
  #define MMQ_X_Q2_K_RDNA2 64
@@ -3855,7 +3891,7 @@ mul_mat_q2_K(
3855
3891
  load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3856
3892
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3857
3893
 
3858
- #elif __CUDA_ARCH__ >= CC_TURING
3894
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3859
3895
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
3860
3896
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3861
3897
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3875,7 +3911,7 @@ mul_mat_q2_K(
3875
3911
  #else
3876
3912
  (void) vec_dot_q2_K_q8_1_mul_mat;
3877
3913
  assert(false);
3878
- #endif // __CUDA_ARCH__ >= CC_TURING
3914
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3879
3915
  }
3880
3916
 
3881
3917
  #define MMQ_X_Q3_K_RDNA2 128
@@ -3896,9 +3932,9 @@ template <bool need_check> static __global__ void
3896
3932
  #if defined(RDNA3) || defined(RDNA2)
3897
3933
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
3898
3934
  #endif // defined(RDNA3) || defined(RDNA2)
3899
- #elif __CUDA_ARCH__ < CC_TURING
3935
+ #elif __CUDA_ARCH__ < CC_VOLTA
3900
3936
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3901
- #endif // __CUDA_ARCH__ < CC_TURING
3937
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3902
3938
  mul_mat_q3_K(
3903
3939
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3904
3940
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3918,7 +3954,7 @@ template <bool need_check> static __global__ void
3918
3954
  load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3919
3955
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3920
3956
 
3921
- #elif __CUDA_ARCH__ >= CC_TURING
3957
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3922
3958
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
3923
3959
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3924
3960
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3938,7 +3974,7 @@ template <bool need_check> static __global__ void
3938
3974
  #else
3939
3975
  (void) vec_dot_q3_K_q8_1_mul_mat;
3940
3976
  assert(false);
3941
- #endif // __CUDA_ARCH__ >= CC_TURING
3977
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3942
3978
  }
3943
3979
 
3944
3980
  #define MMQ_X_Q4_K_RDNA2 64
@@ -3959,9 +3995,9 @@ template <bool need_check> static __global__ void
3959
3995
  #if defined(RDNA3) || defined(RDNA2)
3960
3996
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
3961
3997
  #endif // defined(RDNA3) || defined(RDNA2)
3962
- #elif __CUDA_ARCH__ < CC_TURING
3998
+ #elif __CUDA_ARCH__ < CC_VOLTA
3963
3999
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3964
- #endif // __CUDA_ARCH__ < CC_TURING
4000
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3965
4001
  mul_mat_q4_K(
3966
4002
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3967
4003
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3981,7 +4017,7 @@ template <bool need_check> static __global__ void
3981
4017
  load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3982
4018
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3983
4019
 
3984
- #elif __CUDA_ARCH__ >= CC_TURING
4020
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3985
4021
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
3986
4022
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3987
4023
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -4001,7 +4037,7 @@ template <bool need_check> static __global__ void
4001
4037
  #else
4002
4038
  (void) vec_dot_q4_K_q8_1_mul_mat;
4003
4039
  assert(false);
4004
- #endif // __CUDA_ARCH__ >= CC_TURING
4040
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4005
4041
  }
4006
4042
 
4007
4043
  #define MMQ_X_Q5_K_RDNA2 64
@@ -4042,7 +4078,7 @@ mul_mat_q5_K(
4042
4078
  load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4043
4079
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4044
4080
 
4045
- #elif __CUDA_ARCH__ >= CC_TURING
4081
+ #elif __CUDA_ARCH__ >= CC_VOLTA
4046
4082
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
4047
4083
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
4048
4084
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4062,7 +4098,7 @@ mul_mat_q5_K(
4062
4098
  #else
4063
4099
  (void) vec_dot_q5_K_q8_1_mul_mat;
4064
4100
  assert(false);
4065
- #endif // __CUDA_ARCH__ >= CC_TURING
4101
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4066
4102
  }
4067
4103
 
4068
4104
  #define MMQ_X_Q6_K_RDNA2 64
@@ -4083,9 +4119,9 @@ template <bool need_check> static __global__ void
4083
4119
  #if defined(RDNA3) || defined(RDNA2)
4084
4120
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
4085
4121
  #endif // defined(RDNA3) || defined(RDNA2)
4086
- #elif __CUDA_ARCH__ < CC_TURING
4122
+ #elif __CUDA_ARCH__ < CC_VOLTA
4087
4123
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
4088
- #endif // __CUDA_ARCH__ < CC_TURING
4124
+ #endif // __CUDA_ARCH__ < CC_VOLTA
4089
4125
  mul_mat_q6_K(
4090
4126
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
4091
4127
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4105,7 +4141,7 @@ template <bool need_check> static __global__ void
4105
4141
  load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4106
4142
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4107
4143
 
4108
- #elif __CUDA_ARCH__ >= CC_TURING
4144
+ #elif __CUDA_ARCH__ >= CC_VOLTA
4109
4145
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
4110
4146
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
4111
4147
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4125,7 +4161,7 @@ template <bool need_check> static __global__ void
4125
4161
  #else
4126
4162
  (void) vec_dot_q6_K_q8_1_mul_mat;
4127
4163
  assert(false);
4128
- #endif // __CUDA_ARCH__ >= CC_TURING
4164
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4129
4165
  }
4130
4166
 
4131
4167
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -4550,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
4550
4586
  dst[i] = scale * x[i];
4551
4587
  }
4552
4588
 
4589
+ static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
4590
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
4591
+
4592
+ if (i >= k) {
4593
+ return;
4594
+ }
4595
+
4596
+ dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
4597
+ }
4598
+
4599
+ template<int qk, int qr, dequantize_kernel_t dq>
4600
+ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
4601
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
4602
+ const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
4603
+ const dim3 block_nums(block_num_x, nrows, 1);
4604
+ k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
4605
+ }
4606
+
4553
4607
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4554
4608
  const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4555
4609
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
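
get_rows_cuda launches one thread per pair of columns, because k_get_rows writes both halves of a dequantized pair (v.x and v.y); that is why the x-dimension of the grid divides by 2*CUDA_GET_ROWS_BLOCK_SIZE. Checking the arithmetic for an assumed ncols = 4096:

    const int ncols       = 4096;                                // illustrative row width
    const int block_num_x = (ncols + 2*256 - 1) / (2*256);       // == 8
    // 8 blocks x 256 threads x 2 columns per thread == 4096 columns, covered exactly.
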
@@ -4604,32 +4658,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
4604
4658
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
4605
4659
  }
4606
4660
 
4607
- static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4661
+ template<typename dst_t>
4662
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4608
4663
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4609
4664
  dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4610
4665
  }
4611
4666
 
4612
- static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4667
+ template<typename dst_t>
4668
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4613
4669
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4614
4670
  dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4615
4671
  }
4616
4672
 
4617
- static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4673
+ template<typename dst_t>
4674
+ static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4618
4675
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4619
4676
  dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4620
4677
  }
4621
4678
 
4622
- static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4679
+ template<typename dst_t>
4680
+ static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4623
4681
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4624
4682
  dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4625
4683
  }
4626
4684
 
4627
- static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4685
+ template<typename dst_t>
4686
+ static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4628
4687
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4629
4688
  dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4630
4689
  }
4631
4690
 
4632
- static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4691
+ template<typename dst_t>
4692
+ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4633
4693
  const int nb = k / QK_K;
4634
4694
  #if QK_K == 256
4635
4695
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4638,7 +4698,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
4638
4698
  #endif
4639
4699
  }
4640
4700
 
4641
- static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4701
+ template<typename dst_t>
4702
+ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4642
4703
  const int nb = k / QK_K;
4643
4704
  #if QK_K == 256
4644
4705
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4647,12 +4708,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
4647
4708
  #endif
4648
4709
  }
4649
4710
 
4650
- static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4711
+ template<typename dst_t>
4712
+ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4651
4713
  const int nb = k / QK_K;
4652
4714
  dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
4653
4715
  }
4654
4716
 
4655
- static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4717
+ template<typename dst_t>
4718
+ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4656
4719
  const int nb = k / QK_K;
4657
4720
  #if QK_K == 256
4658
4721
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4661,7 +4724,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu
4661
4724
  #endif
4662
4725
  }
4663
4726
 
4664
- static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4727
+ template<typename dst_t>
4728
+ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4665
4729
  const int nb = k / QK_K;
4666
4730
  #if QK_K == 256
4667
4731
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4868,6 +4932,26 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
4868
4932
 
4869
4933
  static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
4870
4934
  switch (type) {
4935
+ case GGML_TYPE_Q4_0:
4936
+ return dequantize_row_q4_0_cuda;
4937
+ case GGML_TYPE_Q4_1:
4938
+ return dequantize_row_q4_1_cuda;
4939
+ case GGML_TYPE_Q5_0:
4940
+ return dequantize_row_q5_0_cuda;
4941
+ case GGML_TYPE_Q5_1:
4942
+ return dequantize_row_q5_1_cuda;
4943
+ case GGML_TYPE_Q8_0:
4944
+ return dequantize_row_q8_0_cuda;
4945
+ case GGML_TYPE_Q2_K:
4946
+ return dequantize_row_q2_K_cuda;
4947
+ case GGML_TYPE_Q3_K:
4948
+ return dequantize_row_q3_K_cuda;
4949
+ case GGML_TYPE_Q4_K:
4950
+ return dequantize_row_q4_K_cuda;
4951
+ case GGML_TYPE_Q5_K:
4952
+ return dequantize_row_q5_K_cuda;
4953
+ case GGML_TYPE_Q6_K:
4954
+ return dequantize_row_q6_K_cuda;
4871
4955
  case GGML_TYPE_F32:
4872
4956
  return convert_fp32_to_fp16_cuda;
4873
4957
  default:
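
The new quantized cases work because the dequantize_row_*_cuda helpers are now templates: converting the template name to the to_fp16_cuda_t function-pointer type deduces dst_t = half. A sketch of the mechanism with an assumed shape for the typedef (the real one is defined elsewhere in ggml-cuda.cu):

    #include <cuda_runtime.h>
    #include <cuda_fp16.h>

    // Assumed typedef shape, for illustration only.
    typedef void (*to_fp16_cuda_t)(const void * vx, half * y, int k, cudaStream_t stream);

    template<typename dst_t>
    static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, int k, cudaStream_t stream) {
        (void) vx; (void) y; (void) k; (void) stream;   // body elided in this sketch
    }

    static to_fp16_cuda_t pick_q4_0(void) {
        // Template argument deduction against the target pointer type selects the
        // half instantiation, so each case label can simply return the helper.
        return dequantize_row_q4_0_cuda;   // same as dequantize_row_q4_0_cuda<half>
    }
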
@@ -4921,7 +5005,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
4921
5005
  mmq_x = MMQ_X_Q4_0_RDNA1;
4922
5006
  mmq_y = MMQ_Y_Q4_0_RDNA1;
4923
5007
  nwarps = NWARPS_Q4_0_RDNA1;
4924
- } else if (compute_capability >= CC_TURING) {
5008
+ } else if (compute_capability >= CC_VOLTA) {
4925
5009
  mmq_x = MMQ_X_Q4_0_AMPERE;
4926
5010
  mmq_y = MMQ_Y_Q4_0_AMPERE;
4927
5011
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4966,7 +5050,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
4966
5050
  mmq_x = MMQ_X_Q4_1_RDNA1;
4967
5051
  mmq_y = MMQ_Y_Q4_1_RDNA1;
4968
5052
  nwarps = NWARPS_Q4_1_RDNA1;
4969
- } else if (compute_capability >= CC_TURING) {
5053
+ } else if (compute_capability >= CC_VOLTA) {
4970
5054
  mmq_x = MMQ_X_Q4_1_AMPERE;
4971
5055
  mmq_y = MMQ_Y_Q4_1_AMPERE;
4972
5056
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -5011,7 +5095,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
5011
5095
  mmq_x = MMQ_X_Q5_0_RDNA1;
5012
5096
  mmq_y = MMQ_Y_Q5_0_RDNA1;
5013
5097
  nwarps = NWARPS_Q5_0_RDNA1;
5014
- } else if (compute_capability >= CC_TURING) {
5098
+ } else if (compute_capability >= CC_VOLTA) {
5015
5099
  mmq_x = MMQ_X_Q5_0_AMPERE;
5016
5100
  mmq_y = MMQ_Y_Q5_0_AMPERE;
5017
5101
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -5056,7 +5140,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
5056
5140
  mmq_x = MMQ_X_Q5_1_RDNA1;
5057
5141
  mmq_y = MMQ_Y_Q5_1_RDNA1;
5058
5142
  nwarps = NWARPS_Q5_1_RDNA1;
5059
- } else if (compute_capability >= CC_TURING) {
5143
+ } else if (compute_capability >= CC_VOLTA) {
5060
5144
  mmq_x = MMQ_X_Q5_1_AMPERE;
5061
5145
  mmq_y = MMQ_Y_Q5_1_AMPERE;
5062
5146
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -5101,7 +5185,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
5101
5185
  mmq_x = MMQ_X_Q8_0_RDNA1;
5102
5186
  mmq_y = MMQ_Y_Q8_0_RDNA1;
5103
5187
  nwarps = NWARPS_Q8_0_RDNA1;
5104
- } else if (compute_capability >= CC_TURING) {
5188
+ } else if (compute_capability >= CC_VOLTA) {
5105
5189
  mmq_x = MMQ_X_Q8_0_AMPERE;
5106
5190
  mmq_y = MMQ_Y_Q8_0_AMPERE;
5107
5191
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -5146,7 +5230,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
5146
5230
  mmq_x = MMQ_X_Q2_K_RDNA1;
5147
5231
  mmq_y = MMQ_Y_Q2_K_RDNA1;
5148
5232
  nwarps = NWARPS_Q2_K_RDNA1;
5149
- } else if (compute_capability >= CC_TURING) {
5233
+ } else if (compute_capability >= CC_VOLTA) {
5150
5234
  mmq_x = MMQ_X_Q2_K_AMPERE;
5151
5235
  mmq_y = MMQ_Y_Q2_K_AMPERE;
5152
5236
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -5193,7 +5277,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
5193
5277
  mmq_x = MMQ_X_Q3_K_RDNA1;
5194
5278
  mmq_y = MMQ_Y_Q3_K_RDNA1;
5195
5279
  nwarps = NWARPS_Q3_K_RDNA1;
5196
- } else if (compute_capability >= CC_TURING) {
5280
+ } else if (compute_capability >= CC_VOLTA) {
5197
5281
  mmq_x = MMQ_X_Q3_K_AMPERE;
5198
5282
  mmq_y = MMQ_Y_Q3_K_AMPERE;
5199
5283
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -5239,7 +5323,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
5239
5323
  mmq_x = MMQ_X_Q4_K_RDNA1;
5240
5324
  mmq_y = MMQ_Y_Q4_K_RDNA1;
5241
5325
  nwarps = NWARPS_Q4_K_RDNA1;
5242
- } else if (compute_capability >= CC_TURING) {
5326
+ } else if (compute_capability >= CC_VOLTA) {
5243
5327
  mmq_x = MMQ_X_Q4_K_AMPERE;
5244
5328
  mmq_y = MMQ_Y_Q4_K_AMPERE;
5245
5329
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -5284,7 +5368,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
5284
5368
  mmq_x = MMQ_X_Q5_K_RDNA1;
5285
5369
  mmq_y = MMQ_Y_Q5_K_RDNA1;
5286
5370
  nwarps = NWARPS_Q5_K_RDNA1;
5287
- } else if (compute_capability >= CC_TURING) {
5371
+ } else if (compute_capability >= CC_VOLTA) {
5288
5372
  mmq_x = MMQ_X_Q5_K_AMPERE;
5289
5373
  mmq_y = MMQ_Y_Q5_K_AMPERE;
5290
5374
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -5329,7 +5413,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
5329
5413
  mmq_x = MMQ_X_Q6_K_RDNA1;
5330
5414
  mmq_y = MMQ_Y_Q6_K_RDNA1;
5331
5415
  nwarps = NWARPS_Q6_K_RDNA1;
5332
- } else if (compute_capability >= CC_TURING) {
5416
+ } else if (compute_capability >= CC_VOLTA) {
5333
5417
  mmq_x = MMQ_X_Q6_K_AMPERE;
5334
5418
  mmq_y = MMQ_Y_Q6_K_AMPERE;
5335
5419
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -5401,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
5401
5485
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5402
5486
  }
5403
5487
 
5488
+ static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
5489
+ const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
5490
+ clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
5491
+ }
5492
+
5404
5493
  template<typename T>
5405
5494
  static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5406
5495
  const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
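
clamp_f32_cuda mirrors the other elementwise launchers: one thread per element, grid rounded up with the usual ceil-division. For reference, the same clamp on the CPU (a sketch):

    // Clamp every element of x into [min, max]; identical math to the clamp_f32 kernel.
    static void clamp_f32_ref(const float * x, float * dst, float min, float max, int k) {
        for (int i = 0; i < k; ++i) {
            dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
        }
    }
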
@@ -5668,7 +5757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5668
5757
  } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5669
5758
  GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5670
5759
  kind = cudaMemcpyDeviceToDevice;
5671
- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5760
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5672
5761
  int id;
5673
5762
  CUDA_CHECK(cudaGetDevice(&id));
5674
5763
  src_ptr = (char *) extra->data_device[id];
@@ -5704,6 +5793,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5704
5793
  }
5705
5794
  }
5706
5795
 
5796
+ static void ggml_cuda_op_repeat(
5797
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5798
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5799
+ // guaranteed to be an integer due to the check in ggml_can_repeat
5800
+ const int64_t ne0 = dst->ne[0];
5801
+ const int64_t ne1 = dst->ne[1];
5802
+ const int64_t ne2 = dst->ne[2];
5803
+ const int64_t ne3 = dst->ne[3];
5804
+
5805
+ const int64_t ne00 = src0->ne[0];
5806
+ const int64_t ne01 = src0->ne[1];
5807
+ const int64_t ne02 = src0->ne[2];
5808
+ const int64_t ne03 = src0->ne[3];
5809
+
5810
+ const size_t nb0 = dst->nb[0];
5811
+ const size_t nb1 = dst->nb[1];
5812
+ const size_t nb2 = dst->nb[2];
5813
+ const size_t nb3 = dst->nb[3];
5814
+
5815
+ const size_t nb00 = src0->nb[0];
5816
+ const size_t nb01 = src0->nb[1];
5817
+ const size_t nb02 = src0->nb[2];
5818
+ const size_t nb03 = src0->nb[3];
5819
+
5820
+ const int nr0 = (int)(ne0/ne00);
5821
+ const int nr1 = (int)(ne1/ne01);
5822
+ const int nr2 = (int)(ne2/ne02);
5823
+ const int nr3 = (int)(ne3/ne03);
5824
+
5825
+ // TODO: support for transposed / permuted tensors
5826
+ GGML_ASSERT(nb0 == sizeof(float));
5827
+ GGML_ASSERT(nb00 == sizeof(float));
5828
+
5829
+ // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
5830
+ for (int i3 = 0; i3 < nr3; i3++) {
5831
+ for (int k3 = 0; k3 < ne03; k3++) {
5832
+ for (int i2 = 0; i2 < nr2; i2++) {
5833
+ for (int k2 = 0; k2 < ne02; k2++) {
5834
+ for (int i1 = 0; i1 < nr1; i1++) {
5835
+ for (int k1 = 0; k1 < ne01; k1++) {
5836
+ for (int i0 = 0; i0 < nr0; i0++) {
5837
+ CUDA_CHECK(cudaMemcpyAsync(
5838
+ (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
5839
+ (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
5840
+ ne00*nb0, cudaMemcpyDeviceToDevice, stream));
5841
+ }
5842
+ }
5843
+ }
5844
+ }
5845
+ }
5846
+ }
5847
+ }
5848
+
5849
+ (void) src1;
5850
+ (void) src1_d;
5851
+ }
5852
+
5853
+ static void ggml_cuda_op_get_rows(
5854
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5855
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5856
+
5857
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
5858
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
5859
+ GGML_ASSERT(ggml_is_contiguous(src0));
5860
+ GGML_ASSERT(ggml_is_contiguous(src1));
5861
+ GGML_ASSERT(ggml_is_contiguous(dst));
5862
+
5863
+ const int ncols = src0->ne[0];
5864
+ const int nrows = ggml_nelements(src1);
5865
+
5866
+ const int32_t * src1_i32 = (const int32_t *) src1_d;
5867
+
5868
+ switch (src0->type) {
5869
+ case GGML_TYPE_F16:
5870
+ get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5871
+ break;
5872
+ case GGML_TYPE_F32:
5873
+ get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5874
+ break;
5875
+ case GGML_TYPE_Q4_0:
5876
+ get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5877
+ break;
5878
+ case GGML_TYPE_Q4_1:
5879
+ get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5880
+ break;
5881
+ case GGML_TYPE_Q5_0:
5882
+ get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5883
+ break;
5884
+ case GGML_TYPE_Q5_1:
5885
+ get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5886
+ break;
5887
+ case GGML_TYPE_Q8_0:
5888
+ get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5889
+ break;
5890
+ default:
5891
+ // TODO: k-quants
5892
+ GGML_ASSERT(false);
5893
+ break;
5894
+ }
5895
+ }
5896
+
5707
5897
  inline void ggml_cuda_op_add(
5708
5898
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5709
5899
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
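
ggml_cuda_op_repeat tiles src0 across dst nr0..nr3 times per dimension, issuing one cudaMemcpyAsync per contiguous run of ne00 elements; the TODO notes that a dedicated kernel would be cheaper. A CPU reference of the same tiling for the 2-D case (nr2 = nr3 = 1), as a sketch:

    #include <string.h>

    // dst is (nr1*ne01) rows by (nr0*ne00) columns; every (i1, i0) tile is a copy of src0.
    static void repeat_2d_ref(const float * src0, float * dst,
                              int ne00, int ne01, int nr0, int nr1) {
        const int ne0 = nr0*ne00;                      // dst row length
        for (int i1 = 0; i1 < nr1; ++i1) {
            for (int k1 = 0; k1 < ne01; ++k1) {
                for (int i0 = 0; i0 < nr0; ++i0) {
                    // one contiguous run of ne00 elements, mirroring the cudaMemcpyAsync above
                    memcpy(dst + (i1*ne01 + k1)*ne0 + i0*ne00,
                           src0 + k1*ne00,
                           ne00*sizeof(float));
                }
            }
        }
    }
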
@@ -5907,7 +6097,7 @@ static int64_t get_row_rounding(ggml_type type) {
5907
6097
  switch(type) {
5908
6098
  case GGML_TYPE_Q4_0:
5909
6099
  case GGML_TYPE_Q4_1:
5910
- return max_compute_capability >= CC_TURING ? 128 : 64;
6100
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
5911
6101
  case GGML_TYPE_Q5_0:
5912
6102
  case GGML_TYPE_Q5_1:
5913
6103
  case GGML_TYPE_Q8_0:
@@ -5918,7 +6108,7 @@ static int64_t get_row_rounding(ggml_type type) {
5918
6108
  case GGML_TYPE_Q3_K:
5919
6109
  case GGML_TYPE_Q4_K:
5920
6110
  case GGML_TYPE_Q5_K:
5921
- return max_compute_capability >= CC_TURING ? 128 : 64;
6111
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
5922
6112
  case GGML_TYPE_Q6_K:
5923
6113
  return 64;
5924
6114
  default:
@@ -6083,8 +6273,19 @@ inline void ggml_cuda_op_mul_mat_cublas(
6083
6273
 
6084
6274
  const int compute_capability = g_compute_capabilities[id];
6085
6275
 
6086
- if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
6087
- // convert src1 to fp16, multiply as fp16, convert dst to fp32
6276
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
6277
+ // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
6278
+ half * src0_as_f16 = nullptr;
6279
+ size_t src0_as = 0;
6280
+ if (src0->type != GGML_TYPE_F16) {
6281
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
6282
+ GGML_ASSERT(to_fp16_cuda != nullptr);
6283
+ size_t ne = row_diff*ne00;
6284
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
6285
+ to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
6286
+ }
6287
+ const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
6288
+
6088
6289
  half * src1_as_f16 = nullptr;
6089
6290
  size_t src1_as = 0;
6090
6291
  if (src1->type != GGML_TYPE_F16) {
@@ -6106,9 +6307,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
6106
6307
  CUBLAS_CHECK(
6107
6308
  cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6108
6309
  row_diff, src1_ncols, ne10,
6109
- &alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
6110
- src1_ptr, CUDA_R_16F, ne10,
6111
- &beta_f16, dst_f16, CUDA_R_16F, ldc,
6310
+ &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
6311
+ src1_ptr, CUDA_R_16F, ne10,
6312
+ &beta_f16, dst_f16, CUDA_R_16F, ldc,
6112
6313
  CUBLAS_COMPUTE_16F,
6113
6314
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
6114
6315
 
@@ -6117,6 +6318,10 @@ inline void ggml_cuda_op_mul_mat_cublas(
6117
6318
 
6118
6319
  ggml_cuda_pool_free(dst_f16, dst_as);
6119
6320
 
6321
+ if (src0_as != 0) {
6322
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
6323
+ }
6324
+
6120
6325
  if (src1_as != 0) {
6121
6326
  ggml_cuda_pool_free(src1_as_f16, src1_as);
6122
6327
  }
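
The fp16 cuBLAS route now also accepts quantized src0: a temporary fp16 copy is taken from the memory pool, filled by the converter returned from ggml_get_to_fp16_cuda, used for the GEMM, and handed back afterwards. The pool convention, condensed (n, the element count, is illustrative; the non-zero size written to the out-parameter is what signals that a buffer must be freed):

    size_t actual = 0;
    half * tmp = (half *) ggml_cuda_pool_malloc(n*sizeof(half), &actual);  // actual receives the reserved size
    // ... dequantize into tmp and run the fp16 GEMM ...
    if (actual != 0) {
        ggml_cuda_pool_free(tmp, actual);                                  // return the buffer to the pool
    }
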
@@ -6229,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
6229
6434
  const int64_t ne02 = src0->ne[2];
6230
6435
  const int64_t nrows = ggml_nrows(src0);
6231
6436
 
6232
- const int n_past = ((int32_t *) dst->op_params)[0];
6437
+ //const int n_past = ((int32_t *) dst->op_params)[0];
6233
6438
  const int n_head = ((int32_t *) dst->op_params)[1];
6234
6439
  float max_bias;
6235
6440
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
6236
6441
 
6237
- GGML_ASSERT(ne01 + n_past == ne00);
6442
+ //GGML_ASSERT(ne01 + n_past == ne00);
6238
6443
  GGML_ASSERT(n_head == ne02);
6239
6444
 
6240
6445
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6293,7 +6498,14 @@ inline void ggml_cuda_op_scale(
6293
6498
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6294
6499
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6295
6500
 
6296
- const float scale = ((float *) src1->data)[0];
6501
+ float scale;
6502
+ // HACK: support for ggml backend interface
6503
+ if (src1->backend == GGML_BACKEND_CPU) {
6504
+ scale = ((float *) src1->data)[0];
6505
+ } else {
6506
+ // TODO: pass pointer to kernel instead of copying to host
6507
+ CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
6508
+ }
6297
6509
 
6298
6510
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
6299
6511
  CUDA_CHECK(cudaGetLastError());
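
When src1 lives on the GPU the scale factor is a single fp32 value in device memory, so the stop-gap above copies those 4 bytes back to the host before launching scale_f32_cuda. The TODO points at the cleaner fix; a purely hypothetical sketch of it, not part of this release, would read the scale on-device instead:

    // Hypothetical variant suggested by the TODO: take a device pointer to the scale.
    static __global__ void scale_f32_ptr(const float * x, float * dst, const float * scale, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        dst[i] = *scale * x[i];
    }
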
@@ -6303,6 +6515,24 @@ inline void ggml_cuda_op_scale(
6303
6515
  (void) src1_dd;
6304
6516
  }
6305
6517
 
6518
+ inline void ggml_cuda_op_clamp(
6519
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6520
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6521
+
6522
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6523
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6524
+
6525
+ const float min = ((float *) dst->op_params)[0];
6526
+ const float max = ((float *) dst->op_params)[1];
6527
+
6528
+ clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
6529
+ CUDA_CHECK(cudaGetLastError());
6530
+
6531
+ (void) src1;
6532
+ (void) dst;
6533
+ (void) src1_dd;
6534
+ }
6535
+
6306
6536
  static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6307
6537
  const int64_t nrows0 = ggml_nrows(src0);
6308
6538
 
@@ -6312,9 +6542,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
6312
6542
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6313
6543
  GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6314
6544
 
6315
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6316
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6317
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6545
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6546
+ ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6547
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6318
6548
 
6319
6549
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6320
6550
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6455,9 +6685,9 @@ static void ggml_cuda_op_mul_mat(
6455
6685
  const size_t q8_1_ts = sizeof(block_q8_1);
6456
6686
  const size_t q8_1_bs = QK8_1;
6457
6687
 
6458
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6459
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6460
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6688
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6689
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6690
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6461
6691
 
6462
6692
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6463
6693
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6535,7 +6765,7 @@ static void ggml_cuda_op_mul_mat(
6535
6765
  if (convert_src1_to_q8_1) {
6536
6766
  src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6537
6767
 
6538
- if (split && src1_on_device && src1_is_contiguous) {
6768
+ if (src1_on_device && src1_is_contiguous) {
6539
6769
  quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6540
6770
  CUDA_CHECK(cudaGetLastError());
6541
6771
  }
@@ -6617,7 +6847,7 @@ static void ggml_cuda_op_mul_mat(
6617
6847
  GGML_ASSERT(false);
6618
6848
  }
6619
6849
 
6620
- if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6850
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
6621
6851
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6622
6852
  CUDA_CHECK(cudaGetLastError());
6623
6853
  }
@@ -6708,6 +6938,14 @@ static void ggml_cuda_op_mul_mat(
6708
6938
  }
6709
6939
  }
6710
6940
 
6941
+ static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6942
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
6943
+ }
6944
+
6945
+ static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6946
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
6947
+ }
6948
+
6711
6949
  static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6712
6950
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6713
6951
  }
@@ -6762,13 +7000,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
6762
7000
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6763
7001
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6764
7002
 
6765
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7003
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6766
7004
  void * src0_ddq = src0_extra->data_device[g_main_device];
6767
7005
 
6768
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7006
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6769
7007
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6770
7008
 
6771
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7009
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6772
7010
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6773
7011
 
6774
7012
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -6793,13 +7031,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
6793
7031
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6794
7032
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6795
7033
 
6796
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7034
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6797
7035
  void * src0_ddq = src0_extra->data_device[g_main_device];
6798
7036
 
6799
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7037
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6800
7038
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6801
7039
 
6802
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7040
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6803
7041
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6804
7042
 
6805
7043
  const int64_t row_stride_x = nb01 / sizeof(half);
@@ -6820,11 +7058,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
6820
7058
  }
6821
7059
  }
6822
7060
 
6823
- if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7061
+ if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6824
7062
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6825
7063
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6826
7064
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
6827
- }else if (src0->type == GGML_TYPE_F32) {
7065
+ } else if (src0->type == GGML_TYPE_F32) {
6828
7066
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6829
7067
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
6830
7068
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -6856,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
6856
7094
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6857
7095
  }
6858
7096
 
7097
+ static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7098
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
7099
+ }
7100
+
6859
7101
  static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6860
7102
  const int64_t ne = ggml_nelements(src0);
6861
7103
  GGML_ASSERT(ne == ggml_nelements(src1));
@@ -6885,8 +7127,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
6885
7127
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6886
7128
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6887
7129
 
6888
- const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6889
- const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7130
+ const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7131
+ const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6890
7132
 
6891
7133
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
6892
7134
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -6941,8 +7183,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6941
7183
 
6942
7184
  const size_t nb1 = tensor->nb[1];
6943
7185
 
6944
- ggml_backend backend = tensor->backend;
6945
- struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
7186
+ ggml_backend_type backend = tensor->backend;
7187
+ ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
6946
7188
  memset(extra, 0, sizeof(*extra));
6947
7189
 
6948
7190
  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -6996,7 +7238,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6996
7238
  CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
6997
7239
  }
6998
7240
 
6999
-
7000
7241
  CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
7001
7242
 
7002
7243
  extra->data_device[id] = buf;
@@ -7035,17 +7276,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
7035
7276
  delete extra;
7036
7277
  }
7037
7278
 
7038
- static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7279
+ static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7039
7280
  static size_t g_temp_tensor_extra_index = 0;
7040
7281
 
7041
- static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7282
+ static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7042
7283
  if (g_temp_tensor_extras == nullptr) {
7043
7284
  g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7044
7285
  }
7045
7286
 
7046
7287
  size_t alloc_index = g_temp_tensor_extra_index;
7047
7288
  g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7048
- struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7289
+ ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7049
7290
  memset(extra, 0, sizeof(*extra));
7050
7291
 
7051
7292
  return extra;
@@ -7073,7 +7314,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7073
7314
  return;
7074
7315
  }
7075
7316
 
7076
- struct ggml_tensor_extra_gpu * extra;
7317
+ ggml_tensor_extra_gpu * extra;
7077
7318
 
7078
7319
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7079
7320
  tensor->op == GGML_OP_VIEW ||
@@ -7082,7 +7323,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7082
7323
 
7083
7324
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7084
7325
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7085
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7326
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7086
7327
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7087
7328
  size_t offset = 0;
7088
7329
  if (tensor->op == GGML_OP_VIEW) {
@@ -7091,7 +7332,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7091
7332
  extra = ggml_cuda_alloc_temp_tensor_extra();
7092
7333
  extra->data_device[g_main_device] = src0_ddc + offset;
7093
7334
  } else if (tensor->op == GGML_OP_CPY) {
7094
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7335
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7095
7336
  void * src1_ddv = src1_extra->data_device[g_main_device];
7096
7337
  extra = ggml_cuda_alloc_temp_tensor_extra();
7097
7338
  extra->data_device[g_main_device] = src1_ddv;
@@ -7133,13 +7374,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
7133
7374
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
7134
7375
  }
7135
7376
 
7136
- struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7377
+ ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7137
7378
 
7138
7379
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7139
7380
  tensor->op == GGML_OP_VIEW;
7140
7381
 
7141
7382
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7142
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7383
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7143
7384
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7144
7385
  size_t view_offset = 0;
7145
7386
  if (tensor->op == GGML_OP_VIEW) {
@@ -7157,7 +7398,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
7157
7398
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7158
7399
  GGML_ASSERT(ggml_is_contiguous(tensor));
7159
7400
 
7160
- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7401
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7161
7402
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7162
7403
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7163
7404
  }
@@ -7214,58 +7455,47 @@ void ggml_cuda_free_scratch() {
7214
7455
  g_scratch_buffer = nullptr;
7215
7456
  }
7216
7457
 
7217
- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
7458
+ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
7218
7459
  ggml_cuda_func_t func;
7219
7460
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7220
7461
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
7221
7462
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
7222
7463
 
7464
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
7465
+ return false;
7466
+ }
7467
+
7223
7468
  switch (tensor->op) {
7469
+ case GGML_OP_REPEAT:
7470
+ func = ggml_cuda_repeat;
7471
+ break;
7472
+ case GGML_OP_GET_ROWS:
7473
+ func = ggml_cuda_get_rows;
7474
+ break;
7224
7475
  case GGML_OP_DUP:
7225
- if (!any_on_device) {
7226
- return false;
7227
- }
7228
7476
  func = ggml_cuda_dup;
7229
7477
  break;
7230
7478
  case GGML_OP_ADD:
7231
- if (!any_on_device) {
7232
- return false;
7233
- }
7234
7479
  func = ggml_cuda_add;
7235
7480
  break;
7236
7481
  case GGML_OP_MUL:
7237
- if (!any_on_device) {
7238
- return false;
7239
- }
7240
7482
  func = ggml_cuda_mul;
7241
7483
  break;
7242
7484
  case GGML_OP_UNARY:
7243
7485
  switch (ggml_get_unary_op(tensor)) {
7244
7486
  case GGML_UNARY_OP_GELU:
7245
- if (!any_on_device) {
7246
- return false;
7247
- }
7248
7487
  func = ggml_cuda_gelu;
7249
7488
  break;
7250
7489
  case GGML_UNARY_OP_SILU:
7251
- if (!any_on_device) {
7252
- return false;
7253
- }
7254
7490
  func = ggml_cuda_silu;
7255
7491
  break;
7256
7492
  default:
7257
7493
  return false;
7258
7494
  } break;
7259
7495
  case GGML_OP_NORM:
7260
- if (!any_on_device) {
7261
- return false;
7262
- }
7263
7496
  func = ggml_cuda_norm;
7264
7497
  break;
7265
7498
  case GGML_OP_RMS_NORM:
7266
- if (!any_on_device) {
7267
- return false;
7268
- }
7269
7499
  func = ggml_cuda_rms_norm;
7270
7500
  break;
7271
7501
  case GGML_OP_MUL_MAT:
@@ -7275,54 +7505,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
7275
7505
  func = ggml_cuda_mul_mat;
7276
7506
  break;
7277
7507
  case GGML_OP_SCALE:
7278
- if (!any_on_device) {
7279
- return false;
7280
- }
7281
7508
  func = ggml_cuda_scale;
7282
7509
  break;
7283
- case GGML_OP_CPY:
7510
+ case GGML_OP_CLAMP:
7284
7511
  if (!any_on_device) {
7285
7512
  return false;
7286
7513
  }
7514
+ func = ggml_cuda_clamp;
7515
+ break;
7516
+ case GGML_OP_CPY:
7287
7517
  func = ggml_cuda_cpy;
7288
7518
  break;
7289
7519
  case GGML_OP_CONT:
7290
- if (!any_on_device) {
7291
- return false;
7292
- }
7293
7520
  func = ggml_cuda_dup;
7294
7521
  break;
7295
7522
  case GGML_OP_RESHAPE:
7296
7523
  case GGML_OP_VIEW:
7297
7524
  case GGML_OP_PERMUTE:
7298
7525
  case GGML_OP_TRANSPOSE:
7299
- if (!any_on_device) {
7300
- return false;
7301
- }
7302
7526
  func = ggml_cuda_nop;
7303
7527
  break;
7304
7528
  case GGML_OP_DIAG_MASK_INF:
7305
- if (!any_on_device) {
7306
- return false;
7307
- }
7308
7529
  func = ggml_cuda_diag_mask_inf;
7309
7530
  break;
7310
7531
  case GGML_OP_SOFT_MAX:
7311
- if (!any_on_device) {
7312
- return false;
7313
- }
7314
7532
  func = ggml_cuda_soft_max;
7315
7533
  break;
7316
7534
  case GGML_OP_ROPE:
7317
- if (!any_on_device) {
7318
- return false;
7319
- }
7320
7535
  func = ggml_cuda_rope;
7321
7536
  break;
7322
7537
  case GGML_OP_ALIBI:
7323
- if (!any_on_device) {
7324
- return false;
7325
- }
7326
7538
  func = ggml_cuda_alibi;
7327
7539
  break;
7328
7540
  default:
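
Editor's note: the hunks above hoist the repeated per-case `if (!any_on_device) return false;` guard into a single early return at the top of `ggml_cuda_compute_forward`, keeping `GGML_OP_MUL_MAT` as the one op that is still considered when no operand lives on the GPU. A minimal, self-contained sketch of that control-flow shape, using stand-in types (`Op`, `Tensor`, `run_*` are illustrative, not the ggml definitions):

    #include <cstdio>

    enum Op { OP_ADD, OP_MUL_MAT, OP_ROPE };

    struct Tensor {
        Op   op;
        bool any_on_device;   // stands in for the backend checks on the tensor and its sources
    };

    using OpFunc = void (*)(const Tensor &);

    static void run_add(const Tensor &)     { std::puts("add on GPU"); }
    static void run_mul_mat(const Tensor &) { std::puts("mul_mat on GPU"); }
    static void run_rope(const Tensor &)    { std::puts("rope on GPU"); }

    // One early return instead of a guard duplicated in every case label.
    static bool compute_forward(const Tensor & t) {
        if (!t.any_on_device && t.op != OP_MUL_MAT) {
            return false;                     // nothing for the GPU to do
        }
        OpFunc func = nullptr;
        switch (t.op) {
            case OP_ADD:     func = run_add;     break;
            case OP_MUL_MAT: func = run_mul_mat; break;
            case OP_ROPE:    func = run_rope;    break;
            default:         return false;       // op not supported
        }
        func(t);
        return true;
    }

    int main() {
        compute_forward({OP_ADD, true});       // dispatched
        compute_forward({OP_ADD, false});      // rejected by the hoisted guard
        compute_forward({OP_MUL_MAT, false});  // still considered, as in the diff
    }
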
@@ -7350,3 +7562,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
7350
7562
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
7351
7563
  snprintf(description, description_size, "%s", prop.name);
7352
7564
  }
7565
+
7566
+ ////////////////////////////////////////////////////////////////////////////////
7567
+
7568
+ // backend interface
7569
+
7570
+ #define UNUSED GGML_UNUSED
7571
+
7572
+ struct ggml_backend_context_cuda {
7573
+ };
7574
+
7575
+ static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
7576
+ return GGML_CUDA_NAME;
7577
+
7578
+ UNUSED(backend);
7579
+ }
7580
+
7581
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
7582
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
7583
+ delete cuda_ctx;
7584
+ delete backend;
7585
+ }
7586
+
7587
+ struct ggml_backend_buffer_context_cuda {
7588
+ void * device;
7589
+
7590
+ ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
7591
+ size_t temp_tensor_extra_index = 0;
7592
+
7593
+ ~ggml_backend_buffer_context_cuda() {
7594
+ delete[] temp_tensor_extras;
7595
+ }
7596
+
7597
+ ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7598
+ if (temp_tensor_extras == nullptr) {
7599
+ temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7600
+ }
7601
+
7602
+ size_t alloc_index = temp_tensor_extra_index;
7603
+ temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7604
+ ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
7605
+ memset(extra, 0, sizeof(*extra));
7606
+
7607
+ return extra;
7608
+ }
7609
+ };
7610
+
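
Editor's note: `ggml_backend_buffer_context_cuda` lazily allocates a fixed pool of `GGML_MAX_NODES` tensor extras and hands them out round-robin, so per-node metadata never needs an individual free. A stripped-down sketch of the same pooling pattern; the `Extra` struct and pool size here are placeholders, not the ggml definitions:

    #include <cstring>

    struct Extra {                        // placeholder for ggml_tensor_extra_gpu
        void * data_device[16];
    };

    struct ExtraPool {
        static const size_t kSlots = 4096;   // stand-in for GGML_MAX_NODES
        Extra * slots = nullptr;
        size_t  next  = 0;

        ~ExtraPool() { delete[] slots; }

        Extra * alloc() {
            if (slots == nullptr) {
                slots = new Extra[kSlots];   // allocated on first use only
            }
            Extra * e = &slots[next];
            next = (next + 1) % kSlots;      // wrap around; slots are recycled per graph
            std::memset(e, 0, sizeof(*e));   // hand out a zeroed slot, as in the diff
            return e;
        }
    };

    int main() {
        ExtraPool pool;
        Extra * a = pool.alloc();            // slot 0
        Extra * b = pool.alloc();            // slot 1
        (void) a; (void) b;
    }

The trade-off is the same as in the diff: no per-allocation bookkeeping, in exchange for the assumption that at most `kSlots` extras are live at any time.
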
7611
+ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
7612
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7613
+ CUDA_CHECK(cudaFree(ctx->device));
7614
+ delete ctx;
7615
+ }
7616
+
7617
+ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
7618
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7619
+ return ctx->device;
7620
+ }
7621
+
7622
+ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
7623
+ int64_t row_low = 0;
7624
+ int64_t row_high = ggml_nrows(tensor);
7625
+ int64_t nrows_split = row_high - row_low;
7626
+
7627
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
7628
+
7629
+ int64_t ne0 = tensor->ne[0];
7630
+
7631
+ if (ggml_is_quantized(tensor->type)) {
7632
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
7633
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
7634
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
7635
+ }
7636
+ }
7637
+
7638
+ return size;
7639
+
7640
+ UNUSED(buffer);
7641
+ }
7642
+
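
Editor's note: `ggml_backend_cuda_buffer_get_alloc_size` pads quantized rows up to a multiple of `MATRIX_ROW_PADDING` so the quantized kernels can safely read whole blocks past the logical end of a row. The arithmetic, shown with made-up numbers (the real padding constant, block size, and type size come from ggml-cuda.cu / ggml.c):

    #include <cstdio>
    #include <cstdint>

    int main() {
        const int64_t ne0         = 4001;   // hypothetical row length in elements
        const int64_t row_padding = 512;    // stand-in for MATRIX_ROW_PADDING
        const int64_t type_size   = 18;     // bytes per quant block (illustrative)
        const int64_t block_size  = 32;     // elements per quant block (illustrative)

        // unpadded bytes for one row (partial blocks ignored for brevity)
        int64_t size = ne0 / block_size * type_size;

        // mirror the padding term in ggml_backend_cuda_buffer_get_alloc_size
        if (ne0 % row_padding != 0) {
            size += (row_padding - ne0 % row_padding) * type_size / block_size;
        }
        std::printf("padded row size: %lld bytes\n", (long long) size);
    }
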
7643
+ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
7644
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7645
+
7646
+ if (tensor->view_src != NULL && tensor->view_offs == 0) {
7647
+ assert(tensor->view_src->buffer->backend == buffer->backend);
7648
+ tensor->backend = tensor->view_src->backend;
7649
+ tensor->extra = tensor->view_src->extra;
7650
+ return;
7651
+ }
7652
+
7653
+ ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
7654
+
7655
+ extra->data_device[g_main_device] = tensor->data;
7656
+
7657
+ tensor->backend = GGML_BACKEND_GPU;
7658
+ tensor->extra = extra;
7659
+
7660
+ if (ggml_is_quantized(tensor->type)) {
7661
+ // initialize padding to 0 to avoid possible NaN values
7662
+ int64_t row_low = 0;
7663
+ int64_t row_high = ggml_nrows(tensor);
7664
+ int64_t nrows_split = row_high - row_low;
7665
+
7666
+ size_t original_size = ggml_nbytes_split(tensor, nrows_split);
7667
+ size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
7668
+
7669
+ if (padded_size > original_size && tensor->view_src == nullptr) {
7670
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
7671
+ }
7672
+ }
7673
+
7674
+ UNUSED(buffer);
7675
+ }
7676
+
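
Editor's note: `ggml_backend_cuda_buffer_init_tensor` lets views alias their parent's extra, and for quantized tensors it zeroes only the padding tail with `cudaMemsetAsync` so the padded region cannot feed NaNs into the dot-product kernels. A tiny standalone sketch of clearing just the tail of a device allocation (sizes are illustrative; error checks omitted for brevity):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        const size_t original_size = 1000;   // bytes actually holding tensor data
        const size_t padded_size   = 1024;   // allocation rounded up for the kernels

        cudaStream_t stream;
        cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

        char * data = nullptr;
        cudaMalloc(&data, padded_size);

        if (padded_size > original_size) {
            // same pattern as init_tensor: only the tail bytes are cleared
            cudaMemsetAsync(data + original_size, 0, padded_size - original_size, stream);
        }

        cudaStreamSynchronize(stream);
        std::printf("cleared %zu padding bytes\n", padded_size - original_size);

        cudaFree(data);
        cudaStreamDestroy(stream);
        return 0;
    }
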
7677
+ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
7678
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
7679
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
7680
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
7681
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
7682
+ /* .free_tensor = */ NULL,
7683
+ };
7684
+
7685
+ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
7686
+ ggml_cuda_set_device(g_main_device);
7687
+
7688
+ ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
7689
+ CUDA_CHECK(cudaMalloc(&ctx->device, size));
7690
+ return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
7691
+ }
7692
+
7693
+ static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
7694
+ return 128;
7695
+ UNUSED(backend);
7696
+ }
7697
+
7698
+ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
7699
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
7700
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
7701
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7702
+
7703
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
7704
+
7705
+ UNUSED(backend);
7706
+ }
7707
+
7708
+ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
7709
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
7710
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
7711
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7712
+
7713
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
7714
+
7715
+ UNUSED(backend);
7716
+ }
7717
+
7718
+ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
7719
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
7720
+
7721
+ UNUSED(backend);
7722
+ }
7723
+
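
Editor's note: the set/get tensor functions issue `cudaMemcpyAsync` on the main device stream and rely on `ggml_backend_cuda_synchronize` (a `cudaStreamSynchronize`) before the host touches the data. A minimal standalone sketch of that pattern, independent of ggml (note that the copies only truly overlap with other work when the host memory is pinned):

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    #define CHECK(call) do { cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); return 1; } } while (0)

    int main() {
        const size_t n = 1024;
        std::vector<float> host_in(n, 1.0f), host_out(n, 0.0f);

        cudaStream_t stream;
        CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

        float * device_buf = nullptr;
        CHECK(cudaMalloc(&device_buf, n * sizeof(float)));

        // "set_tensor_async": host -> device on the stream, returns immediately
        CHECK(cudaMemcpyAsync(device_buf, host_in.data(), n * sizeof(float), cudaMemcpyHostToDevice, stream));
        // "get_tensor_async": device -> host on the same stream
        CHECK(cudaMemcpyAsync(host_out.data(), device_buf, n * sizeof(float), cudaMemcpyDeviceToHost, stream));

        // "synchronize": only after this is host_out guaranteed to hold the data
        CHECK(cudaStreamSynchronize(stream));
        std::printf("host_out[0] = %f\n", host_out[0]);

        CHECK(cudaFree(device_buf));
        CHECK(cudaStreamDestroy(stream));
        return 0;
    }
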
7724
+ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
7725
+ GGML_ASSERT(!"not implemented");
7726
+
7727
+ return nullptr;
7728
+
7729
+ UNUSED(backend);
7730
+ UNUSED(cgraph);
7731
+ }
7732
+
7733
+ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
7734
+ GGML_ASSERT(!"not implemented");
7735
+
7736
+ UNUSED(backend);
7737
+ UNUSED(plan);
7738
+ }
7739
+
7740
+ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
7741
+ GGML_ASSERT(!"not implemented");
7742
+
7743
+ UNUSED(backend);
7744
+ UNUSED(plan);
7745
+ }
7746
+
7747
+ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
7748
+ ggml_cuda_set_device(g_main_device);
7749
+
7750
+ ggml_compute_params params = {};
7751
+ params.type = GGML_TASK_COMPUTE;
7752
+ params.ith = 0;
7753
+ for (int i = 0; i < cgraph->n_nodes; i++) {
7754
+ ggml_tensor * node = cgraph->nodes[i];
7755
+
7756
+ assert(node->backend == GGML_BACKEND_GPU);
7757
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
7758
+ if (node->src[j] != nullptr) {
7759
+ assert(node->src[j]->backend == GGML_BACKEND_GPU);
7760
+ }
7761
+ }
7762
+
7763
+ bool ok = ggml_cuda_compute_forward(&params, node);
7764
+ if (!ok) {
7765
+ fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
7766
+ }
7767
+ GGML_ASSERT(ok);
7768
+
7769
+ #if 0
7770
+ if (node->type == GGML_TYPE_F32) {
7771
+ cudaDeviceSynchronize();
7772
+ std::vector<float> tmp(ggml_nelements(node), 0.0f);
7773
+ cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
7774
+ printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
7775
+ ggml_type_name(node->src[0]->type),
7776
+ node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
7777
+ node->src[0]->name,
7778
+ node->src[1] ? node->src[1]->name : "none");
7779
+ double sum = 0.0;
7780
+ double sq_sum = 0.0;
7781
+ for (int i = 0; i < ggml_nelements(node); i++) {
7782
+ printf("%f ", tmp[i]);
7783
+ sum += tmp[i];
7784
+ sq_sum += tmp[i]*tmp[i];
7785
+ }
7786
+ printf("\n");
7787
+ printf("sum: %f, ", sum);
7788
+ printf("sq_sum: %f\n", sq_sum);
7789
+ }
7790
+ #endif
7791
+ }
7792
+
7793
+ UNUSED(backend);
7794
+ }
7795
+
7796
+ static ggml_backend_i cuda_backend_i = {
7797
+ /* .get_name = */ ggml_backend_cuda_name,
7798
+ /* .free = */ ggml_backend_cuda_free,
7799
+ /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
7800
+ /* .get_alignment = */ ggml_backend_cuda_get_alignment,
7801
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
7802
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
7803
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
7804
+ /* .cpy_tensor_from = */ nullptr,
7805
+ /* .cpy_tensor_to = */ nullptr,
7806
+ /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
7807
+ /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
7808
+ /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
7809
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
7810
+ /* .supports_op = */ nullptr,
7811
+ };
7812
+
7813
+ ggml_backend_t ggml_backend_cuda_init() {
7814
+ ggml_init_cublas(); // TODO: remove from ggml.c
7815
+
7816
+ ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
7817
+
7818
+ ggml_backend_t cuda_backend = new ggml_backend {
7819
+ /* .interface = */ cuda_backend_i,
7820
+ /* .context = */ ctx
7821
+ };
7822
+
7823
+ return cuda_backend;
7824
+ }
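
Editor's note: `ggml_backend_cuda_init` returns a `ggml_backend` whose vtable is the `cuda_backend_i` table above. A hedged sketch of how a caller might drive it directly through that table; it assumes the init function is declared in "ggml-cuda.h" and that the `interface`/`context` member names from the initializer above are visible to the caller (real code would normally go through the ggml-backend wrapper functions instead):

    #include "ggml-cuda.h"   // assumed to declare ggml_backend_cuda_init()
    #include <cstdio>

    int main() {
        ggml_backend_t backend = ggml_backend_cuda_init();   // sets up cuBLAS state + context

        std::printf("backend: %s, alignment: %zu\n",
                    backend->interface.get_name(backend),        // "CUDA"
                    backend->interface.get_alignment(backend));  // 128

        // 16 MiB device buffer, backed by cudaMalloc via ggml_backend_cuda_alloc_buffer
        ggml_backend_buffer_t buf = backend->interface.alloc_buffer(backend, 16u * 1024 * 1024);
        (void) buf;   // buffer cleanup goes through the ggml-backend helpers (omitted here)

        // ... place graph tensors in `buf`, then:
        //     backend->interface.graph_compute(backend, &graph);
        //     backend->interface.synchronize(backend);

        backend->interface.free(backend);
        return 0;
    }
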