llama_cpp 0.6.0 → 0.7.0

This diff represents the changes between publicly available package versions as released to their public registry. It is provided for informational purposes only.
@@ -80,9 +80,9 @@
  #include "ggml.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
- #define CC_TURING 700
+ #define CC_VOLTA 700
  #define CC_OFFSET_AMD 1000000
- #define CC_RDNA2 CC_OFFSET_AMD + 1030
+ #define CC_RDNA2 (CC_OFFSET_AMD + 1030)

  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300
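Note: compute capability 7.0 (the value 700) is NVIDIA's Volta generation, while Turing is 7.5, so this is a pure rename correcting a misnomer; the value, and therefore which GPUs take each path, is unchanged. The second change is standard macro hygiene: parenthesizing the CC_RDNA2 body so it expands safely inside larger expressions. A minimal sketch of the precedence trap the parentheses close (the *_OLD/*_NEW names are hypothetical, for illustration only):

    // Hypothetical illustration of the macro-hygiene fix above.
    #define CC_OFFSET_AMD 1000000
    #define CC_RDNA2_OLD CC_OFFSET_AMD + 1030    // 0.6.0 form, unparenthesized
    #define CC_RDNA2_NEW (CC_OFFSET_AMD + 1030)  // 0.7.0 form

    // The unparenthesized body splices into surrounding expressions verbatim,
    // so operator precedence can silently change the meaning:
    static_assert(CC_RDNA2_OLD * 2 == 1002060, "expands to CC_OFFSET_AMD + (1030 * 2)");
    static_assert(CC_RDNA2_NEW * 2 == 2002060, "expands to (CC_OFFSET_AMD + 1030) * 2");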
@@ -715,7 +715,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

  //================================== k-quants

- static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
+ template<typename dst_t>
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

  const int i = blockIdx.x;
  const block_q2_K * x = (const block_q2_K *) vx;
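This and the following hunks template the dequantize_block_* kernels on their destination type, so one kernel body can write either float or half output; the float behavior is unchanged, and the half instantiations feed the new fp16 cuBLAS path further down. A minimal sketch of the pattern, with hypothetical names (scale_rows, out):

    // Sketch only: same shape as the templated kernels above.
    #include <cuda_fp16.h>

    template<typename dst_t>
    static __global__ void scale_rows(const float * src, dst_t * out, float scale) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        out[i] = scale * src[i];   // float result converts implicitly to float or half
    }

    // Both instantiations come from the one definition:
    //   scale_rows<float><<<grid, block>>>(src, y_f32, s);
    //   scale_rows<half> <<<grid, block>>>(src, y_f16, s);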
@@ -727,7 +728,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const int is = 8*n + l/16;

  const uint8_t q = x[i].qs[32*n + l];
- float * y = yy + i*QK_K + 128*n;
+ dst_t * y = yy + i*QK_K + 128*n;

  float dall = __low2half(x[i].dm);
  float dmin = __high2half(x[i].dm);
@@ -739,7 +740,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const int is = tid/16; // 0 or 1
  const int il = tid%16; // 0...15
  const uint8_t q = x[i].qs[il] >> (2*is);
- float * y = yy + i*QK_K + 16*is + il;
+ dst_t * y = yy + i*QK_K + 16*is + il;
  float dall = __low2half(x[i].dm);
  float dmin = __high2half(x[i].dm);
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
@@ -748,7 +749,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float

  }

- static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
+ template<typename dst_t>
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

  const int i = blockIdx.x;
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -772,7 +774,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
  float d_all = x[i].d;
  float dl = d_all * (us - 32);

- float * y = yy + i*QK_K + 128*n + 32*j;
+ dst_t * y = yy + i*QK_K + 128*n + 32*j;
  const uint8_t * q = x[i].qs + 32*n;
  const uint8_t * hm = x[i].hmask;

@@ -784,7 +786,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
  const int im = il/8; // 0...1
  const int in = il%8; // 0...7

- float * y = yy + i*QK_K + 16*is + il;
+ dst_t * y = yy + i*QK_K + 16*is + il;

  const uint8_t q = x[i].qs[il] >> (2*is);
  const uint8_t h = x[i].hmask[in] >> (2*is + im);
@@ -812,7 +814,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
  }
  #endif

- static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
+ template<typename dst_t>
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  const block_q4_K * x = (const block_q4_K *) vx;

  const int i = blockIdx.x;
@@ -825,7 +828,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
  const int is = 2*il;
  const int n = 4;

- float * y = yy + i*QK_K + 64*il + n*ir;
+ dst_t * y = yy + i*QK_K + 64*il + n*ir;

  const float dall = __low2half(x[i].dm);
  const float dmin = __high2half(x[i].dm);
@@ -844,7 +847,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
  #else
  const int tid = threadIdx.x;
  const uint8_t * q = x[i].qs;
- float * y = yy + i*QK_K;
+ dst_t * y = yy + i*QK_K;
  const float d = (float)x[i].dm[0];
  const float m = (float)x[i].dm[1];
  y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
@@ -852,7 +855,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
  #endif
  }

- static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
+ template<typename dst_t>
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  const block_q5_K * x = (const block_q5_K *) vx;

  const int i = blockIdx.x;
@@ -864,7 +868,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
  const int ir = tid%16; // ir is in 0...15
  const int is = 2*il; // is is in 0...6

- float * y = yy + i*QK_K + 64*il + 2*ir;
+ dst_t * y = yy + i*QK_K + 64*il + 2*ir;

  const float dall = __low2half(x[i].dm);
  const float dmin = __high2half(x[i].dm);
@@ -892,13 +896,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
  const int is = tid/16; // 0 or 1
  const uint8_t h = x[i].qh[in] >> im;
  const float d = x[i].d;
- float * y = yy + i*QK_K + tid;
+ dst_t * y = yy + i*QK_K + tid;
  y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
  y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
  #endif
  }

- static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
+ template<typename dst_t>
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
  const block_q6_K * x = (const block_q6_K *) vx;

  const int i = blockIdx.x;
@@ -910,7 +915,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
  const int il = tid - 32*ip; // 0...32
  const int is = 8*ip + il/16;

- float * y = yy + i*QK_K + 128*ip + il;
+ dst_t * y = yy + i*QK_K + 128*ip + il;

  const float d = x[i].d;

@@ -929,7 +934,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
  const int ip = tid/16; // 0 or 1
  const int il = tid - 16*ip; // 0...15

- float * y = yy + i*QK_K + 16*ip + il;
+ dst_t * y = yy + i*QK_K + 16*ip + il;

  const float d = x[i].d;

@@ -3548,7 +3553,7 @@ template <bool need_check> static __global__ void
  load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
  const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3568,7 +3573,7 @@ template <bool need_check> static __global__ void
  #else
  (void) vec_dot_q4_0_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  #define MMQ_X_Q4_1_RDNA2 64
@@ -3589,9 +3594,9 @@ template <bool need_check> static __global__ void
  #if defined(RDNA3) || defined(RDNA2)
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
  #endif // defined(RDNA3) || defined(RDNA2)
- #elif __CUDA_ARCH__ < CC_TURING
+ #elif __CUDA_ARCH__ < CC_VOLTA
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
- #endif // __CUDA_ARCH__ < CC_TURING
+ #endif // __CUDA_ARCH__ < CC_VOLTA
  mul_mat_q4_1(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3611,7 +3616,7 @@ template <bool need_check> static __global__ void
  load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3631,7 +3636,7 @@ template <bool need_check> static __global__ void
  #else
  (void) vec_dot_q4_1_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  #define MMQ_X_Q5_0_RDNA2 64
@@ -3672,7 +3677,7 @@ template <bool need_check> static __global__ void
  load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3692,7 +3697,7 @@ template <bool need_check> static __global__ void
  #else
  (void) vec_dot_q5_0_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  #define MMQ_X_Q5_1_RDNA2 64
@@ -3733,7 +3738,7 @@ mul_mat_q5_1(
  load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3753,7 +3758,7 @@ mul_mat_q5_1(
  #else
  (void) vec_dot_q5_1_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  #define MMQ_X_Q8_0_RDNA2 64
@@ -3794,7 +3799,7 @@ template <bool need_check> static __global__ void
  load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3814,7 +3819,7 @@ template <bool need_check> static __global__ void
  #else
  (void) vec_dot_q8_0_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  #define MMQ_X_Q2_K_RDNA2 64
@@ -3855,7 +3860,7 @@ mul_mat_q2_K(
  load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3875,7 +3880,7 @@ mul_mat_q2_K(
  #else
  (void) vec_dot_q2_K_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  #define MMQ_X_Q3_K_RDNA2 128
@@ -3896,9 +3901,9 @@ template <bool need_check> static __global__ void
  #if defined(RDNA3) || defined(RDNA2)
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
  #endif // defined(RDNA3) || defined(RDNA2)
- #elif __CUDA_ARCH__ < CC_TURING
+ #elif __CUDA_ARCH__ < CC_VOLTA
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
- #endif // __CUDA_ARCH__ < CC_TURING
+ #endif // __CUDA_ARCH__ < CC_VOLTA
  mul_mat_q3_K(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3918,7 +3923,7 @@ template <bool need_check> static __global__ void
  load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3938,7 +3943,7 @@ template <bool need_check> static __global__ void
  #else
  (void) vec_dot_q3_K_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  #define MMQ_X_Q4_K_RDNA2 64
@@ -3959,9 +3964,9 @@ template <bool need_check> static __global__ void
  #if defined(RDNA3) || defined(RDNA2)
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
  #endif // defined(RDNA3) || defined(RDNA2)
- #elif __CUDA_ARCH__ < CC_TURING
+ #elif __CUDA_ARCH__ < CC_VOLTA
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
- #endif // __CUDA_ARCH__ < CC_TURING
+ #endif // __CUDA_ARCH__ < CC_VOLTA
  mul_mat_q4_K(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3981,7 +3986,7 @@ template <bool need_check> static __global__ void
  load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -4001,7 +4006,7 @@ template <bool need_check> static __global__ void
  #else
  (void) vec_dot_q4_K_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  #define MMQ_X_Q5_K_RDNA2 64
@@ -4042,7 +4047,7 @@ mul_mat_q5_K(
  load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4062,7 +4067,7 @@ mul_mat_q5_K(
  #else
  (void) vec_dot_q5_K_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  #define MMQ_X_Q6_K_RDNA2 64
@@ -4083,9 +4088,9 @@ template <bool need_check> static __global__ void
  #if defined(RDNA3) || defined(RDNA2)
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
  #endif // defined(RDNA3) || defined(RDNA2)
- #elif __CUDA_ARCH__ < CC_TURING
+ #elif __CUDA_ARCH__ < CC_VOLTA
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
- #endif // __CUDA_ARCH__ < CC_TURING
+ #endif // __CUDA_ARCH__ < CC_VOLTA
  mul_mat_q6_K(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4105,7 +4110,7 @@ template <bool need_check> static __global__ void
  load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

- #elif __CUDA_ARCH__ >= CC_TURING
+ #elif __CUDA_ARCH__ >= CC_VOLTA
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4125,7 +4130,7 @@ template <bool need_check> static __global__ void
  #else
  (void) vec_dot_q6_K_q8_1_mul_mat;
  assert(false);
- #endif // __CUDA_ARCH__ >= CC_TURING
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
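The CC_TURING → CC_VOLTA hunks above and in the host-side launchers below are the same mechanical rename, applied both to compile-time __CUDA_ARCH__ guards inside kernels and to runtime compute_capability checks on the host; note the *_AMPERE tile constants are selected from Volta up despite the suffix. For reference, a sketch of the standard way the runtime value compared against the 700 threshold is obtained (standard CUDA API; the helper name is local to this note):

    #include <cuda_runtime.h>

    // e.g. sm_70 (Volta) -> 700, sm_86 (Ampere) -> 860
    static int compute_capability_of(int device) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, device);
        return 100*prop.major + 10*prop.minor;
    }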
@@ -4604,32 +4609,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
  }

- static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
  dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

- static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
  dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

- static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
  dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

- static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
  dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

- static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
  dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

- static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
  #if QK_K == 256
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4638,7 +4649,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
  #endif
  }

- static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
  #if QK_K == 256
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4647,12 +4659,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
  #endif
  }

- static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
  dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
  }

- static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
  #if QK_K == 256
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4661,7 +4675,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu
  #endif
  }

- static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+ template<typename dst_t>
+ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
  #if QK_K == 256
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4868,6 +4883,26 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa

  static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
  switch (type) {
+ case GGML_TYPE_Q4_0:
+ return dequantize_row_q4_0_cuda;
+ case GGML_TYPE_Q4_1:
+ return dequantize_row_q4_1_cuda;
+ case GGML_TYPE_Q5_0:
+ return dequantize_row_q5_0_cuda;
+ case GGML_TYPE_Q5_1:
+ return dequantize_row_q5_1_cuda;
+ case GGML_TYPE_Q8_0:
+ return dequantize_row_q8_0_cuda;
+ case GGML_TYPE_Q2_K:
+ return dequantize_row_q2_K_cuda;
+ case GGML_TYPE_Q3_K:
+ return dequantize_row_q3_K_cuda;
+ case GGML_TYPE_Q4_K:
+ return dequantize_row_q4_K_cuda;
+ case GGML_TYPE_Q5_K:
+ return dequantize_row_q5_K_cuda;
+ case GGML_TYPE_Q6_K:
+ return dequantize_row_q6_K_cuda;
  case GGML_TYPE_F32:
  return convert_fp32_to_fp16_cuda;
  default:
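ggml_get_to_fp16_cuda previously only knew how to convert F32; it now maps every quantized type to its dequantize_row_*_cuda launcher. Because those launchers are now templates, returning them through the concrete to_fp16_cuda_t pointer type deduces the <half> instantiation automatically. A self-contained sketch of that mechanism (stub names are hypothetical, not from the diff):

    #include <cuda_runtime.h>
    #include <cuda_fp16.h>

    template<typename dst_t>
    static void dequantize_row_stub(const void * vx, dst_t * y, int k, cudaStream_t stream) {
        // ... launch a dequantize kernel writing dst_t ...
    }

    typedef void (*to_fp16_t)(const void *, half *, int, cudaStream_t);

    static to_fp16_t get_to_fp16(void) {
        // Template argument deduction from the target pointer type picks
        // dequantize_row_stub<half>, exactly as in the switch above.
        return dequantize_row_stub;
    }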
@@ -4921,7 +4956,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
  mmq_x = MMQ_X_Q4_0_RDNA1;
  mmq_y = MMQ_Y_Q4_0_RDNA1;
  nwarps = NWARPS_Q4_0_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q4_0_AMPERE;
  mmq_y = MMQ_Y_Q4_0_AMPERE;
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4966,7 +5001,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
  mmq_x = MMQ_X_Q4_1_RDNA1;
  mmq_y = MMQ_Y_Q4_1_RDNA1;
  nwarps = NWARPS_Q4_1_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q4_1_AMPERE;
  mmq_y = MMQ_Y_Q4_1_AMPERE;
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -5011,7 +5046,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
  mmq_x = MMQ_X_Q5_0_RDNA1;
  mmq_y = MMQ_Y_Q5_0_RDNA1;
  nwarps = NWARPS_Q5_0_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q5_0_AMPERE;
  mmq_y = MMQ_Y_Q5_0_AMPERE;
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -5056,7 +5091,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
  mmq_x = MMQ_X_Q5_1_RDNA1;
  mmq_y = MMQ_Y_Q5_1_RDNA1;
  nwarps = NWARPS_Q5_1_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q5_1_AMPERE;
  mmq_y = MMQ_Y_Q5_1_AMPERE;
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -5101,7 +5136,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
  mmq_x = MMQ_X_Q8_0_RDNA1;
  mmq_y = MMQ_Y_Q8_0_RDNA1;
  nwarps = NWARPS_Q8_0_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q8_0_AMPERE;
  mmq_y = MMQ_Y_Q8_0_AMPERE;
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -5146,7 +5181,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
  mmq_x = MMQ_X_Q2_K_RDNA1;
  mmq_y = MMQ_Y_Q2_K_RDNA1;
  nwarps = NWARPS_Q2_K_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q2_K_AMPERE;
  mmq_y = MMQ_Y_Q2_K_AMPERE;
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -5193,7 +5228,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  mmq_x = MMQ_X_Q3_K_RDNA1;
  mmq_y = MMQ_Y_Q3_K_RDNA1;
  nwarps = NWARPS_Q3_K_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q3_K_AMPERE;
  mmq_y = MMQ_Y_Q3_K_AMPERE;
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -5239,7 +5274,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
  mmq_x = MMQ_X_Q4_K_RDNA1;
  mmq_y = MMQ_Y_Q4_K_RDNA1;
  nwarps = NWARPS_Q4_K_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q4_K_AMPERE;
  mmq_y = MMQ_Y_Q4_K_AMPERE;
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -5284,7 +5319,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
  mmq_x = MMQ_X_Q5_K_RDNA1;
  mmq_y = MMQ_Y_Q5_K_RDNA1;
  nwarps = NWARPS_Q5_K_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q5_K_AMPERE;
  mmq_y = MMQ_Y_Q5_K_AMPERE;
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -5329,7 +5364,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
  mmq_x = MMQ_X_Q6_K_RDNA1;
  mmq_y = MMQ_Y_Q6_K_RDNA1;
  nwarps = NWARPS_Q6_K_RDNA1;
- } else if (compute_capability >= CC_TURING) {
+ } else if (compute_capability >= CC_VOLTA) {
  mmq_x = MMQ_X_Q6_K_AMPERE;
  mmq_y = MMQ_Y_Q6_K_AMPERE;
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -5907,7 +5942,7 @@ static int64_t get_row_rounding(ggml_type type) {
  switch(type) {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
- return max_compute_capability >= CC_TURING ? 128 : 64;
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -5918,7 +5953,7 @@ static int64_t get_row_rounding(ggml_type type) {
  case GGML_TYPE_Q3_K:
  case GGML_TYPE_Q4_K:
  case GGML_TYPE_Q5_K:
- return max_compute_capability >= CC_TURING ? 128 : 64;
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
  case GGML_TYPE_Q6_K:
  return 64;
  default:
@@ -6083,8 +6118,19 @@ inline void ggml_cuda_op_mul_mat_cublas(

  const int compute_capability = g_compute_capabilities[id];

- if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
- // convert src1 to fp16, multiply as fp16, convert dst to fp32
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+ // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+ half * src0_as_f16 = nullptr;
+ size_t src0_as = 0;
+ if (src0->type != GGML_TYPE_F16) {
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
+ GGML_ASSERT(to_fp16_cuda != nullptr);
+ size_t ne = row_diff*ne00;
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
+ to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
+ }
+ const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
+
  half * src1_as_f16 = nullptr;
  size_t src1_as = 0;
  if (src1->type != GGML_TYPE_F16) {
@@ -6106,9 +6152,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
  CUBLAS_CHECK(
  cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
  row_diff, src1_ncols, ne10,
- &alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
- src1_ptr, CUDA_R_16F, ne10,
- &beta_f16, dst_f16, CUDA_R_16F, ldc,
+ &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
+ src1_ptr, CUDA_R_16F, ne10,
+ &beta_f16, dst_f16, CUDA_R_16F, ldc,
  CUBLAS_COMPUTE_16F,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));

@@ -6117,6 +6163,10 @@ inline void ggml_cuda_op_mul_mat_cublas(

  ggml_cuda_pool_free(dst_f16, dst_as);

+ if (src0_as != 0) {
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
+ }
+
  if (src1_as != 0) {
  ggml_cuda_pool_free(src1_as_f16, src1_as);
  }
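Taken together with the templated dequantizers, these hunks widen the fast cuBLAS path: on Volta and newer, a quantized src0 is first dequantized straight to fp16 into pool-allocated scratch (freed again after the GEMM), and the whole multiply runs as fp16 tensor ops via cublasGemmEx. A minimal standalone sketch of such a call, mirroring the parameters used above (buffers assumed allocated and filled elsewhere; error handling elided):

    #include <cublas_v2.h>
    #include <cuda_fp16.h>

    // C (m x n) = A^T * B, where A is stored k x m and B is k x n, all fp16.
    static void gemm_f16(cublasHandle_t h, const half * dA, const half * dB, half * dC,
                         int m, int n, int k) {
        const half alpha = __float2half(1.0f);
        const half beta  = __float2half(0.0f);
        cublasGemmEx(h, CUBLAS_OP_T, CUBLAS_OP_N,
                     m, n, k,
                     &alpha, dA, CUDA_R_16F, k,
                             dB, CUDA_R_16F, k,
                     &beta,  dC, CUDA_R_16F, m,
                     CUBLAS_COMPUTE_16F,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    }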
@@ -1213,12 +1213,9 @@ void ggml_metal_graph_compute(
  float max_bias;
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

- if (__builtin_popcount(n_head) != 1) {
- GGML_ASSERT(false && "only power-of-two n_head implemented");
- }
-
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

  [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1239,7 +1236,9 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
  [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
  [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
- [encoder setBytes:&m0 length:sizeof( float) atIndex:18];
+ [encoder setBytes:&m0 length:sizeof( float) atIndex:18];
+ [encoder setBytes:&m1 length:sizeof( float) atIndex:19];
+ [encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20];

  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
  } break;
@@ -830,7 +830,9 @@ kernel void kernel_alibi_f32(
  constant uint64_t & nb1,
  constant uint64_t & nb2,
  constant uint64_t & nb3,
- constant float & m0,
+ constant float & m0,
+ constant float & m1,
+ constant int & n_heads_log2_floor,
  uint3 tgpig[[threadgroup_position_in_grid]],
  uint3 tpitg[[thread_position_in_threadgroup]],
  uint3 ntg[[threads_per_threadgroup]]) {
@@ -846,7 +848,12 @@ kernel void kernel_alibi_f32(
  const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);

  device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
- float m_k = pow(m0, i2 + 1);
+ float m_k;
+ if (i2 < n_heads_log2_floor) {
+ m_k = pow(m0, i2 + 1);
+ } else {
+ m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1);
+ }
  for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
  device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
  dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1);
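These last two hunks (apparently ggml-metal.m and ggml-metal.metal) replace the Metal ALiBi kernel's power-of-two-only slope schedule with the general one from the ALiBi paper: the first n_heads_log2_floor heads take slopes m0^(i+1), and any remaining heads interleave extra slopes m1^(2(i - n_floor) + 1). That is why the assert could be deleted and why m1 and n_heads_log2_floor are now passed to the kernel. A host-side sketch that prints the schedule for a non-power-of-two head count (names local to this example):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const int   n_head   = 12;     // now allowed; previously asserted power of two
        const float max_bias = 8.0f;

        const int   n_floor = 1 << (int) floorf(log2f((float) n_head));   // 8
        const float m0 = powf(2.0f, -max_bias / n_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_floor);

        for (int h = 0; h < n_head; ++h) {
            const float m_k = h < n_floor ? powf(m0, h + 1)
                                          : powf(m1, 2*(h - n_floor) + 1);
            printf("head %2d: slope %g\n", h, m_k);
        }
        return 0;
    }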