llama_cpp 0.5.3 → 0.7.0

@@ -1,3 +1,4 @@
1
+ #include <algorithm>
1
2
  #include <cstddef>
2
3
  #include <cstdint>
3
4
  #include <limits>
@@ -14,9 +15,11 @@
14
15
  // for rocblas_initialize()
15
16
  #include "rocblas/rocblas.h"
16
17
  #endif // __HIP_PLATFORM_AMD__
18
+ #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
17
19
  #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
18
20
  #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
19
21
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
22
+ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
20
23
  #define CUBLAS_OP_N HIPBLAS_OP_N
21
24
  #define CUBLAS_OP_T HIPBLAS_OP_T
22
25
  #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
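
Note: the two new aliases, CUBLAS_COMPUTE_16F and CUBLAS_GEMM_DEFAULT_TENSOR_OP, exist so that the fp16 cublasGemmEx path added later in this diff also builds against hipBLAS, which has no separate tensor-op algorithm selector (rocBLAS picks kernels on its own). A minimal sketch of the pattern, summarizing the additions above (illustrative only, not taken verbatim from the file):

    // With these aliases in place, backend-agnostic call sites can keep using the cuBLAS
    // names; under GGML_USE_HIPBLAS they expand to the hipBLAS equivalents.
    #if defined(GGML_USE_HIPBLAS)
    #define CUBLAS_COMPUTE_16F            HIPBLAS_R_16F
    #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
    #endif // GGML_USE_HIPBLAS
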
@@ -77,9 +80,9 @@
77
80
  #include "ggml.h"
78
81
 
79
82
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
80
- #define CC_TURING 700
83
+ #define CC_VOLTA 700
81
84
  #define CC_OFFSET_AMD 1000000
82
- #define CC_RDNA2 CC_OFFSET_AMD + 1030
85
+ #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
83
86
 
84
87
  #if defined(GGML_USE_HIPBLAS)
85
88
  #define __CUDA_ARCH__ 1300
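
Note: CC_TURING is renamed to CC_VOLTA while keeping the same threshold, since compute capability 7.0 is Volta rather than Turing (7.5); every guard in the file that compares __CUDA_ARCH__ or the device compute capability against this constant is updated accordingly below. CC_RDNA2 also gains parentheses so it expands safely inside larger expressions. A self-contained illustration of why the parentheses matter (hypothetical macro names, illustrative values):

    #define CC_RDNA2_UNPARENED  1000000 + 1030     // old style
    #define CC_RDNA2_PARENED   (1000000 + 1030)    // new style
    // Without parentheses, a multiplication binds to 1030 only, silently changing the value.
    static_assert(CC_RDNA2_UNPARENED * 2 == 1002060, "precedence bug: expands to 1000000 + 1030*2");
    static_assert(CC_RDNA2_PARENED   * 2 == 2002060, "parenthesized macro is multiplied as a unit");
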
@@ -235,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
235
238
  return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
236
239
  }
237
240
 
241
+ template<typename T>
242
+ using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
243
+ typedef to_t_cuda_t<float> to_fp32_cuda_t;
244
+ typedef to_t_cuda_t<half> to_fp16_cuda_t;
245
+
238
246
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
239
- typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
240
247
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
241
248
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
242
249
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
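
Note: the single-purpose to_fp32_cuda_t function-pointer typedef is generalized into the template alias to_t_cuda_t<T>, with to_fp32_cuda_t and to_fp16_cuda_t as its float and half instantiations; the half variant is what the new fp16 GEMM path uses to request quantized-to-f16 conversion. A sketch of how one launcher satisfies both aliases (hypothetical function names, not from the diff):

    // Any host-side launcher with this shape can be stored in either alias, because
    // to_t_cuda_t<T> is just a function-pointer type parameterized on the output element.
    template<typename dst_t>
    static void convert_rows_example(const void * __restrict__ x, dst_t * __restrict__ y, int k, cudaStream_t stream) {
        // a real launcher would enqueue a conversion kernel for k elements on `stream`
        (void) x; (void) y; (void) k; (void) stream;
    }

    static const to_fp32_cuda_t to_f32_example = convert_rows_example<float>;
    static const to_fp16_cuda_t to_f16_example = convert_rows_example<half>;
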
@@ -461,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
461
468
  static bool g_mul_mat_q = true;
462
469
 
463
470
  static void * g_scratch_buffer = nullptr;
464
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
471
+ static size_t g_scratch_size = 0; // disabled by default
465
472
  static size_t g_scratch_offset = 0;
466
473
 
467
474
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -708,7 +715,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
708
715
 
709
716
  //================================== k-quants
710
717
 
711
- static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
718
+ template<typename dst_t>
719
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
712
720
 
713
721
  const int i = blockIdx.x;
714
722
  const block_q2_K * x = (const block_q2_K *) vx;
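
Note: this is the first of a series of identical changes to the k-quant dequantization kernels: the output element type becomes a template parameter dst_t, so each kernel can be instantiated to write float (the old behaviour) or half (needed when feeding the fp16 GEMM path). The kernel bodies are unchanged apart from the pointer type. A minimal sketch of the pattern (hypothetical kernel, not from the diff):

    template<typename dst_t>
    static __global__ void scale_rows_example(const float * __restrict__ x, dst_t * __restrict__ y, const float d) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        y[i] = d * x[i];   // float arithmetic; the store narrows implicitly when dst_t == half
    }
    // Instantiated as scale_rows_example<float> or scale_rows_example<half> at the launch site.
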
@@ -720,7 +728,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
720
728
  const int is = 8*n + l/16;
721
729
 
722
730
  const uint8_t q = x[i].qs[32*n + l];
723
- float * y = yy + i*QK_K + 128*n;
731
+ dst_t * y = yy + i*QK_K + 128*n;
724
732
 
725
733
  float dall = __low2half(x[i].dm);
726
734
  float dmin = __high2half(x[i].dm);
@@ -732,7 +740,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
732
740
  const int is = tid/16; // 0 or 1
733
741
  const int il = tid%16; // 0...15
734
742
  const uint8_t q = x[i].qs[il] >> (2*is);
735
- float * y = yy + i*QK_K + 16*is + il;
743
+ dst_t * y = yy + i*QK_K + 16*is + il;
736
744
  float dall = __low2half(x[i].dm);
737
745
  float dmin = __high2half(x[i].dm);
738
746
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
@@ -741,7 +749,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
741
749
 
742
750
  }
743
751
 
744
- static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
752
+ template<typename dst_t>
753
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
745
754
 
746
755
  const int i = blockIdx.x;
747
756
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -765,7 +774,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
765
774
  float d_all = x[i].d;
766
775
  float dl = d_all * (us - 32);
767
776
 
768
- float * y = yy + i*QK_K + 128*n + 32*j;
777
+ dst_t * y = yy + i*QK_K + 128*n + 32*j;
769
778
  const uint8_t * q = x[i].qs + 32*n;
770
779
  const uint8_t * hm = x[i].hmask;
771
780
 
@@ -777,7 +786,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
777
786
  const int im = il/8; // 0...1
778
787
  const int in = il%8; // 0...7
779
788
 
780
- float * y = yy + i*QK_K + 16*is + il;
789
+ dst_t * y = yy + i*QK_K + 16*is + il;
781
790
 
782
791
  const uint8_t q = x[i].qs[il] >> (2*is);
783
792
  const uint8_t h = x[i].hmask[in] >> (2*is + im);
@@ -805,7 +814,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
805
814
  }
806
815
  #endif
807
816
 
808
- static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
817
+ template<typename dst_t>
818
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
809
819
  const block_q4_K * x = (const block_q4_K *) vx;
810
820
 
811
821
  const int i = blockIdx.x;
@@ -818,7 +828,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
818
828
  const int is = 2*il;
819
829
  const int n = 4;
820
830
 
821
- float * y = yy + i*QK_K + 64*il + n*ir;
831
+ dst_t * y = yy + i*QK_K + 64*il + n*ir;
822
832
 
823
833
  const float dall = __low2half(x[i].dm);
824
834
  const float dmin = __high2half(x[i].dm);
@@ -837,7 +847,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
837
847
  #else
838
848
  const int tid = threadIdx.x;
839
849
  const uint8_t * q = x[i].qs;
840
- float * y = yy + i*QK_K;
850
+ dst_t * y = yy + i*QK_K;
841
851
  const float d = (float)x[i].dm[0];
842
852
  const float m = (float)x[i].dm[1];
843
853
  y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
@@ -845,7 +855,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
845
855
  #endif
846
856
  }
847
857
 
848
- static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
858
+ template<typename dst_t>
859
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
849
860
  const block_q5_K * x = (const block_q5_K *) vx;
850
861
 
851
862
  const int i = blockIdx.x;
@@ -857,7 +868,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
857
868
  const int ir = tid%16; // ir is in 0...15
858
869
  const int is = 2*il; // is is in 0...6
859
870
 
860
- float * y = yy + i*QK_K + 64*il + 2*ir;
871
+ dst_t * y = yy + i*QK_K + 64*il + 2*ir;
861
872
 
862
873
  const float dall = __low2half(x[i].dm);
863
874
  const float dmin = __high2half(x[i].dm);
@@ -885,13 +896,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
885
896
  const int is = tid/16; // 0 or 1
886
897
  const uint8_t h = x[i].qh[in] >> im;
887
898
  const float d = x[i].d;
888
- float * y = yy + i*QK_K + tid;
899
+ dst_t * y = yy + i*QK_K + tid;
889
900
  y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
890
901
  y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
891
902
  #endif
892
903
  }
893
904
 
894
- static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
905
+ template<typename dst_t>
906
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
895
907
  const block_q6_K * x = (const block_q6_K *) vx;
896
908
 
897
909
  const int i = blockIdx.x;
@@ -903,7 +915,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
903
915
  const int il = tid - 32*ip; // 0...32
904
916
  const int is = 8*ip + il/16;
905
917
 
906
- float * y = yy + i*QK_K + 128*ip + il;
918
+ dst_t * y = yy + i*QK_K + 128*ip + il;
907
919
 
908
920
  const float d = x[i].d;
909
921
 
@@ -922,7 +934,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
922
934
  const int ip = tid/16; // 0 or 1
923
935
  const int il = tid - 16*ip; // 0...15
924
936
 
925
- float * y = yy + i*QK_K + 16*ip + il;
937
+ dst_t * y = yy + i*QK_K + 16*ip + il;
926
938
 
927
939
  const float d = x[i].d;
928
940
 
@@ -1515,6 +1527,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1515
1527
  v.y = x[ib + iqs + 1];
1516
1528
  }
1517
1529
 
1530
+ static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
1531
+ const float * x = (const float *) vx;
1532
+
1533
+ // automatic half -> float type cast if dfloat == float
1534
+ v.x = x[ib + iqs + 0];
1535
+ v.y = x[ib + iqs + 1];
1536
+ }
1537
+
1518
1538
  static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1519
1539
  const int ix = blockDim.x*blockIdx.x + threadIdx.x;
1520
1540
 
@@ -1554,8 +1574,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1554
1574
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1555
1575
  }
1556
1576
 
1557
- template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1558
- static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
1577
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1578
+ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1559
1579
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
1560
1580
 
1561
1581
  if (i >= k) {
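
Note: dequantize_block gains the same dst_t parameter, and convert_f32 (added above) is a pass-through "dequantizer" that simply loads two consecutive floats. Combining the two turns the generic kernel into an f32-to-f16 copy, which is how the new convert_fp32_to_fp16_cuda launcher later in this diff is built. A sketch of that composition using the names from the diff (the diff's own launcher computes the grid size with CUDA_QUANTIZE_BLOCK_SIZE; the sketch uses the dequantize block size for both, for consistency):

    static void f32_to_f16_example(const float * x, half * y, const int k, cudaStream_t stream) {
        const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
        // qk = qr = 1: one element pair per "quant block", no actual dequantization involved
        dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(x, y, k);
    }
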
@@ -3533,7 +3553,7 @@ template <bool need_check> static __global__ void
3533
3553
  load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3534
3554
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3535
3555
 
3536
- #elif __CUDA_ARCH__ >= CC_TURING
3556
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3537
3557
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
3538
3558
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3539
3559
  const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3553,7 +3573,7 @@ template <bool need_check> static __global__ void
3553
3573
  #else
3554
3574
  (void) vec_dot_q4_0_q8_1_mul_mat;
3555
3575
  assert(false);
3556
- #endif // __CUDA_ARCH__ >= CC_TURING
3576
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3557
3577
  }
3558
3578
 
3559
3579
  #define MMQ_X_Q4_1_RDNA2 64
@@ -3574,9 +3594,9 @@ template <bool need_check> static __global__ void
3574
3594
  #if defined(RDNA3) || defined(RDNA2)
3575
3595
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
3576
3596
  #endif // defined(RDNA3) || defined(RDNA2)
3577
- #elif __CUDA_ARCH__ < CC_TURING
3597
+ #elif __CUDA_ARCH__ < CC_VOLTA
3578
3598
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3579
- #endif // __CUDA_ARCH__ < CC_TURING
3599
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3580
3600
  mul_mat_q4_1(
3581
3601
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3582
3602
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3596,7 +3616,7 @@ template <bool need_check> static __global__ void
3596
3616
  load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3597
3617
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3598
3618
 
3599
- #elif __CUDA_ARCH__ >= CC_TURING
3619
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3600
3620
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
3601
3621
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3602
3622
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3616,7 +3636,7 @@ template <bool need_check> static __global__ void
3616
3636
  #else
3617
3637
  (void) vec_dot_q4_1_q8_1_mul_mat;
3618
3638
  assert(false);
3619
- #endif // __CUDA_ARCH__ >= CC_TURING
3639
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3620
3640
  }
3621
3641
 
3622
3642
  #define MMQ_X_Q5_0_RDNA2 64
@@ -3657,7 +3677,7 @@ template <bool need_check> static __global__ void
3657
3677
  load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3658
3678
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3659
3679
 
3660
- #elif __CUDA_ARCH__ >= CC_TURING
3680
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3661
3681
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
3662
3682
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3663
3683
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3677,7 +3697,7 @@ template <bool need_check> static __global__ void
3677
3697
  #else
3678
3698
  (void) vec_dot_q5_0_q8_1_mul_mat;
3679
3699
  assert(false);
3680
- #endif // __CUDA_ARCH__ >= CC_TURING
3700
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3681
3701
  }
3682
3702
 
3683
3703
  #define MMQ_X_Q5_1_RDNA2 64
@@ -3718,7 +3738,7 @@ mul_mat_q5_1(
3718
3738
  load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3719
3739
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3720
3740
 
3721
- #elif __CUDA_ARCH__ >= CC_TURING
3741
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3722
3742
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
3723
3743
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3724
3744
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3738,7 +3758,7 @@ mul_mat_q5_1(
3738
3758
  #else
3739
3759
  (void) vec_dot_q5_1_q8_1_mul_mat;
3740
3760
  assert(false);
3741
- #endif // __CUDA_ARCH__ >= CC_TURING
3761
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3742
3762
  }
3743
3763
 
3744
3764
  #define MMQ_X_Q8_0_RDNA2 64
@@ -3779,7 +3799,7 @@ template <bool need_check> static __global__ void
3779
3799
  load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3780
3800
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3781
3801
 
3782
- #elif __CUDA_ARCH__ >= CC_TURING
3802
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3783
3803
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
3784
3804
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3785
3805
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3799,7 +3819,7 @@ template <bool need_check> static __global__ void
3799
3819
  #else
3800
3820
  (void) vec_dot_q8_0_q8_1_mul_mat;
3801
3821
  assert(false);
3802
- #endif // __CUDA_ARCH__ >= CC_TURING
3822
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3803
3823
  }
3804
3824
 
3805
3825
  #define MMQ_X_Q2_K_RDNA2 64
@@ -3840,7 +3860,7 @@ mul_mat_q2_K(
3840
3860
  load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3841
3861
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3842
3862
 
3843
- #elif __CUDA_ARCH__ >= CC_TURING
3863
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3844
3864
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
3845
3865
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3846
3866
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3860,7 +3880,7 @@ mul_mat_q2_K(
3860
3880
  #else
3861
3881
  (void) vec_dot_q2_K_q8_1_mul_mat;
3862
3882
  assert(false);
3863
- #endif // __CUDA_ARCH__ >= CC_TURING
3883
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3864
3884
  }
3865
3885
 
3866
3886
  #define MMQ_X_Q3_K_RDNA2 128
@@ -3881,9 +3901,9 @@ template <bool need_check> static __global__ void
3881
3901
  #if defined(RDNA3) || defined(RDNA2)
3882
3902
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
3883
3903
  #endif // defined(RDNA3) || defined(RDNA2)
3884
- #elif __CUDA_ARCH__ < CC_TURING
3904
+ #elif __CUDA_ARCH__ < CC_VOLTA
3885
3905
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3886
- #endif // __CUDA_ARCH__ < CC_TURING
3906
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3887
3907
  mul_mat_q3_K(
3888
3908
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3889
3909
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3903,7 +3923,7 @@ template <bool need_check> static __global__ void
3903
3923
  load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3904
3924
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3905
3925
 
3906
- #elif __CUDA_ARCH__ >= CC_TURING
3926
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3907
3927
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
3908
3928
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3909
3929
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3923,7 +3943,7 @@ template <bool need_check> static __global__ void
3923
3943
  #else
3924
3944
  (void) vec_dot_q3_K_q8_1_mul_mat;
3925
3945
  assert(false);
3926
- #endif // __CUDA_ARCH__ >= CC_TURING
3946
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3927
3947
  }
3928
3948
 
3929
3949
  #define MMQ_X_Q4_K_RDNA2 64
@@ -3944,9 +3964,9 @@ template <bool need_check> static __global__ void
3944
3964
  #if defined(RDNA3) || defined(RDNA2)
3945
3965
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
3946
3966
  #endif // defined(RDNA3) || defined(RDNA2)
3947
- #elif __CUDA_ARCH__ < CC_TURING
3967
+ #elif __CUDA_ARCH__ < CC_VOLTA
3948
3968
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3949
- #endif // __CUDA_ARCH__ < CC_TURING
3969
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3950
3970
  mul_mat_q4_K(
3951
3971
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3952
3972
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3966,7 +3986,7 @@ template <bool need_check> static __global__ void
3966
3986
  load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3967
3987
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3968
3988
 
3969
- #elif __CUDA_ARCH__ >= CC_TURING
3989
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3970
3990
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
3971
3991
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3972
3992
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3986,7 +4006,7 @@ template <bool need_check> static __global__ void
3986
4006
  #else
3987
4007
  (void) vec_dot_q4_K_q8_1_mul_mat;
3988
4008
  assert(false);
3989
- #endif // __CUDA_ARCH__ >= CC_TURING
4009
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3990
4010
  }
3991
4011
 
3992
4012
  #define MMQ_X_Q5_K_RDNA2 64
@@ -4027,7 +4047,7 @@ mul_mat_q5_K(
4027
4047
  load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4028
4048
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4029
4049
 
4030
- #elif __CUDA_ARCH__ >= CC_TURING
4050
+ #elif __CUDA_ARCH__ >= CC_VOLTA
4031
4051
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
4032
4052
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
4033
4053
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4047,7 +4067,7 @@ mul_mat_q5_K(
4047
4067
  #else
4048
4068
  (void) vec_dot_q5_K_q8_1_mul_mat;
4049
4069
  assert(false);
4050
- #endif // __CUDA_ARCH__ >= CC_TURING
4070
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4051
4071
  }
4052
4072
 
4053
4073
  #define MMQ_X_Q6_K_RDNA2 64
@@ -4068,9 +4088,9 @@ template <bool need_check> static __global__ void
4068
4088
  #if defined(RDNA3) || defined(RDNA2)
4069
4089
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
4070
4090
  #endif // defined(RDNA3) || defined(RDNA2)
4071
- #elif __CUDA_ARCH__ < CC_TURING
4091
+ #elif __CUDA_ARCH__ < CC_VOLTA
4072
4092
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
4073
- #endif // __CUDA_ARCH__ < CC_TURING
4093
+ #endif // __CUDA_ARCH__ < CC_VOLTA
4074
4094
  mul_mat_q6_K(
4075
4095
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
4076
4096
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4090,7 +4110,7 @@ template <bool need_check> static __global__ void
4090
4110
  load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4091
4111
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4092
4112
 
4093
- #elif __CUDA_ARCH__ >= CC_TURING
4113
+ #elif __CUDA_ARCH__ >= CC_VOLTA
4094
4114
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
4095
4115
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
4096
4116
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4110,7 +4130,7 @@ template <bool need_check> static __global__ void
4110
4130
  #else
4111
4131
  (void) vec_dot_q6_K_q8_1_mul_mat;
4112
4132
  assert(false);
4113
- #endif // __CUDA_ARCH__ >= CC_TURING
4133
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4114
4134
  }
4115
4135
 
4116
4136
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -4355,8 +4375,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
4355
4375
  }
4356
4376
 
4357
4377
  // rope == RoPE == rotary positional embedding
4358
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
4359
- const float p_delta, const int p_delta_rows, const float theta_scale) {
4378
+
4379
+ template<typename T, bool has_pos>
4380
+ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4381
+ const int p_delta_rows, const float theta_scale) {
4360
4382
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4361
4383
 
4362
4384
  if (col >= ncols) {
@@ -4365,8 +4387,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
4365
4387
 
4366
4388
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
4367
4389
  const int i = row*ncols + col;
4390
+ const int i2 = row/p_delta_rows;
4368
4391
 
4369
- const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
4392
+ const int p = has_pos ? pos[i2] : 0;
4393
+ const float p0 = p*freq_scale;
4394
+ const float theta = p0*powf(theta_scale, col/2);
4370
4395
  const float sin_theta = sinf(theta);
4371
4396
  const float cos_theta = cosf(theta);
4372
4397
 
@@ -4377,8 +4402,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
4377
4402
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
4378
4403
  }
4379
4404
 
4380
- static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
4381
- const float p_delta, const int p_delta_rows, const float theta_scale) {
4405
+ template<typename T, bool has_pos>
4406
+ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4407
+ const int p_delta_rows, const float theta_scale) {
4382
4408
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4383
4409
 
4384
4410
  if (col >= ncols) {
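
Note: the RoPE kernels are reworked along two axes: they are templated on the element type T (float or half) and on a compile-time has_pos flag, and the rotation angle is now derived from an explicit per-token int32 position array plus freq_scale instead of the former p0/p_delta pair. A sketch of the new angle computation using the names from the diff (the real kernels make has_pos a template parameter so the branch is resolved at compile time):

    __device__ __forceinline__ float rope_theta_example(const int32_t * pos, const int row, const int col,
                                                        const int p_delta_rows, const float freq_scale,
                                                        const float theta_scale, const bool has_pos) {
        const int p = has_pos ? pos[row/p_delta_rows] : 0;   // which sequence position this row belongs to
        return p*freq_scale*powf(theta_scale, col/2);        // integer division col/2, as in the diff
    }
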
@@ -4387,8 +4413,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
4387
4413
 
4388
4414
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
4389
4415
  const int i = row*ncols + col/2;
4416
+ const int i2 = row/p_delta_rows;
4390
4417
 
4391
- const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
4418
+ const int p = has_pos ? pos[i2] : 0;
4419
+ const float p0 = p*freq_scale;
4420
+ const float theta = p0*powf(theta_scale, col/2);
4392
4421
  const float sin_theta = sinf(theta);
4393
4422
  const float cos_theta = cosf(theta);
4394
4423
 
@@ -4399,8 +4428,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
4399
4428
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4400
4429
  }
4401
4430
 
4402
- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
4403
- const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
4431
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
4432
+ const int p_delta_rows, const float theta_scale, const int n_ctx) {
4404
4433
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4405
4434
  const int half_n_dims = ncols/4;
4406
4435
 
@@ -4410,11 +4439,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4410
4439
 
4411
4440
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
4412
4441
  const int i = row*ncols + col;
4442
+ const int i2 = row/p_delta_rows;
4413
4443
 
4414
4444
  const float col_theta_scale = powf(theta_scale, col);
4415
- const float p = p0 + p_delta*(row/p_delta_rows);
4445
+ // FIXME: this is likely wrong
4446
+ const int p = pos != nullptr ? pos[i2] : 0;
4416
4447
 
4417
- const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
4448
+ const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
4418
4449
  const float sin_theta = sinf(theta);
4419
4450
  const float cos_theta = cosf(theta);
4420
4451
 
@@ -4424,7 +4455,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4424
4455
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
4425
4456
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
4426
4457
 
4427
- const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
4458
+ const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
4428
4459
  const float sin_block_theta = sinf(block_theta);
4429
4460
  const float cos_block_theta = cosf(block_theta);
4430
4461
 
@@ -4578,32 +4609,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
4578
4609
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
4579
4610
  }
4580
4611
 
4581
- static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4612
+ template<typename dst_t>
4613
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4582
4614
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4583
4615
  dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4584
4616
  }
4585
4617
 
4586
- static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4618
+ template<typename dst_t>
4619
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4587
4620
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4588
4621
  dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4589
4622
  }
4590
4623
 
4591
- static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4624
+ template<typename dst_t>
4625
+ static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4592
4626
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4593
4627
  dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4594
4628
  }
4595
4629
 
4596
- static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4630
+ template<typename dst_t>
4631
+ static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4597
4632
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4598
4633
  dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4599
4634
  }
4600
4635
 
4601
- static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4636
+ template<typename dst_t>
4637
+ static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4602
4638
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4603
4639
  dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4604
4640
  }
4605
4641
 
4606
- static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4642
+ template<typename dst_t>
4643
+ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4607
4644
  const int nb = k / QK_K;
4608
4645
  #if QK_K == 256
4609
4646
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4612,7 +4649,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
4612
4649
  #endif
4613
4650
  }
4614
4651
 
4615
- static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4652
+ template<typename dst_t>
4653
+ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4616
4654
  const int nb = k / QK_K;
4617
4655
  #if QK_K == 256
4618
4656
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4621,12 +4659,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
4621
4659
  #endif
4622
4660
  }
4623
4661
 
4624
- static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4662
+ template<typename dst_t>
4663
+ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4625
4664
  const int nb = k / QK_K;
4626
4665
  dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
4627
4666
  }
4628
4667
 
4629
- static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4668
+ template<typename dst_t>
4669
+ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4630
4670
  const int nb = k / QK_K;
4631
4671
  #if QK_K == 256
4632
4672
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4635,7 +4675,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu
4635
4675
  #endif
4636
4676
  }
4637
4677
 
4638
- static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4678
+ template<typename dst_t>
4679
+ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4639
4680
  const int nb = k / QK_K;
4640
4681
  #if QK_K == 256
4641
4682
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4826,6 +4867,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
4826
4867
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4827
4868
  }
4828
4869
 
4870
+ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
4871
+ const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
4872
+ dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4873
+ }
4874
+
4829
4875
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4830
4876
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4831
4877
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -4835,6 +4881,35 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
4835
4881
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4836
4882
  }
4837
4883
 
4884
+ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
4885
+ switch (type) {
4886
+ case GGML_TYPE_Q4_0:
4887
+ return dequantize_row_q4_0_cuda;
4888
+ case GGML_TYPE_Q4_1:
4889
+ return dequantize_row_q4_1_cuda;
4890
+ case GGML_TYPE_Q5_0:
4891
+ return dequantize_row_q5_0_cuda;
4892
+ case GGML_TYPE_Q5_1:
4893
+ return dequantize_row_q5_1_cuda;
4894
+ case GGML_TYPE_Q8_0:
4895
+ return dequantize_row_q8_0_cuda;
4896
+ case GGML_TYPE_Q2_K:
4897
+ return dequantize_row_q2_K_cuda;
4898
+ case GGML_TYPE_Q3_K:
4899
+ return dequantize_row_q3_K_cuda;
4900
+ case GGML_TYPE_Q4_K:
4901
+ return dequantize_row_q4_K_cuda;
4902
+ case GGML_TYPE_Q5_K:
4903
+ return dequantize_row_q5_K_cuda;
4904
+ case GGML_TYPE_Q6_K:
4905
+ return dequantize_row_q6_K_cuda;
4906
+ case GGML_TYPE_F32:
4907
+ return convert_fp32_to_fp16_cuda;
4908
+ default:
4909
+ return nullptr;
4910
+ }
4911
+ }
4912
+
4838
4913
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
4839
4914
  switch (type) {
4840
4915
  case GGML_TYPE_Q4_0:
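
Note: ggml_get_to_fp16_cuda mirrors the existing ggml_get_to_fp32_cuda dispatcher but returns the half-output instantiations of the now-templated dequantize launchers, plus convert_fp32_to_fp16_cuda for f32 input; unsupported types yield nullptr, which callers are expected to assert on. A hedged usage sketch following the pattern the new cuBLAS path uses (hypothetical wrapper name):

    static void dequantize_to_f16_example(const void * src_q, const ggml_type type,
                                          half * dst_f16, const int n, cudaStream_t stream) {
        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type);
        GGML_ASSERT(to_fp16_cuda != nullptr);   // nullptr means no f16 conversion exists for this type
        to_fp16_cuda(src_q, dst_f16, n, stream);
    }
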
@@ -4881,7 +4956,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
4881
4956
  mmq_x = MMQ_X_Q4_0_RDNA1;
4882
4957
  mmq_y = MMQ_Y_Q4_0_RDNA1;
4883
4958
  nwarps = NWARPS_Q4_0_RDNA1;
4884
- } else if (compute_capability >= CC_TURING) {
4959
+ } else if (compute_capability >= CC_VOLTA) {
4885
4960
  mmq_x = MMQ_X_Q4_0_AMPERE;
4886
4961
  mmq_y = MMQ_Y_Q4_0_AMPERE;
4887
4962
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4926,7 +5001,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
4926
5001
  mmq_x = MMQ_X_Q4_1_RDNA1;
4927
5002
  mmq_y = MMQ_Y_Q4_1_RDNA1;
4928
5003
  nwarps = NWARPS_Q4_1_RDNA1;
4929
- } else if (compute_capability >= CC_TURING) {
5004
+ } else if (compute_capability >= CC_VOLTA) {
4930
5005
  mmq_x = MMQ_X_Q4_1_AMPERE;
4931
5006
  mmq_y = MMQ_Y_Q4_1_AMPERE;
4932
5007
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -4971,7 +5046,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4971
5046
  mmq_x = MMQ_X_Q5_0_RDNA1;
4972
5047
  mmq_y = MMQ_Y_Q5_0_RDNA1;
4973
5048
  nwarps = NWARPS_Q5_0_RDNA1;
4974
- } else if (compute_capability >= CC_TURING) {
5049
+ } else if (compute_capability >= CC_VOLTA) {
4975
5050
  mmq_x = MMQ_X_Q5_0_AMPERE;
4976
5051
  mmq_y = MMQ_Y_Q5_0_AMPERE;
4977
5052
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -5016,7 +5091,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
5016
5091
  mmq_x = MMQ_X_Q5_1_RDNA1;
5017
5092
  mmq_y = MMQ_Y_Q5_1_RDNA1;
5018
5093
  nwarps = NWARPS_Q5_1_RDNA1;
5019
- } else if (compute_capability >= CC_TURING) {
5094
+ } else if (compute_capability >= CC_VOLTA) {
5020
5095
  mmq_x = MMQ_X_Q5_1_AMPERE;
5021
5096
  mmq_y = MMQ_Y_Q5_1_AMPERE;
5022
5097
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -5061,7 +5136,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
5061
5136
  mmq_x = MMQ_X_Q8_0_RDNA1;
5062
5137
  mmq_y = MMQ_Y_Q8_0_RDNA1;
5063
5138
  nwarps = NWARPS_Q8_0_RDNA1;
5064
- } else if (compute_capability >= CC_TURING) {
5139
+ } else if (compute_capability >= CC_VOLTA) {
5065
5140
  mmq_x = MMQ_X_Q8_0_AMPERE;
5066
5141
  mmq_y = MMQ_Y_Q8_0_AMPERE;
5067
5142
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -5106,7 +5181,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
5106
5181
  mmq_x = MMQ_X_Q2_K_RDNA1;
5107
5182
  mmq_y = MMQ_Y_Q2_K_RDNA1;
5108
5183
  nwarps = NWARPS_Q2_K_RDNA1;
5109
- } else if (compute_capability >= CC_TURING) {
5184
+ } else if (compute_capability >= CC_VOLTA) {
5110
5185
  mmq_x = MMQ_X_Q2_K_AMPERE;
5111
5186
  mmq_y = MMQ_Y_Q2_K_AMPERE;
5112
5187
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -5153,7 +5228,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
5153
5228
  mmq_x = MMQ_X_Q3_K_RDNA1;
5154
5229
  mmq_y = MMQ_Y_Q3_K_RDNA1;
5155
5230
  nwarps = NWARPS_Q3_K_RDNA1;
5156
- } else if (compute_capability >= CC_TURING) {
5231
+ } else if (compute_capability >= CC_VOLTA) {
5157
5232
  mmq_x = MMQ_X_Q3_K_AMPERE;
5158
5233
  mmq_y = MMQ_Y_Q3_K_AMPERE;
5159
5234
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -5199,7 +5274,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
5199
5274
  mmq_x = MMQ_X_Q4_K_RDNA1;
5200
5275
  mmq_y = MMQ_Y_Q4_K_RDNA1;
5201
5276
  nwarps = NWARPS_Q4_K_RDNA1;
5202
- } else if (compute_capability >= CC_TURING) {
5277
+ } else if (compute_capability >= CC_VOLTA) {
5203
5278
  mmq_x = MMQ_X_Q4_K_AMPERE;
5204
5279
  mmq_y = MMQ_Y_Q4_K_AMPERE;
5205
5280
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -5244,7 +5319,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
5244
5319
  mmq_x = MMQ_X_Q5_K_RDNA1;
5245
5320
  mmq_y = MMQ_Y_Q5_K_RDNA1;
5246
5321
  nwarps = NWARPS_Q5_K_RDNA1;
5247
- } else if (compute_capability >= CC_TURING) {
5322
+ } else if (compute_capability >= CC_VOLTA) {
5248
5323
  mmq_x = MMQ_X_Q5_K_AMPERE;
5249
5324
  mmq_y = MMQ_Y_Q5_K_AMPERE;
5250
5325
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -5289,7 +5364,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
5289
5364
  mmq_x = MMQ_X_Q6_K_RDNA1;
5290
5365
  mmq_y = MMQ_Y_Q6_K_RDNA1;
5291
5366
  nwarps = NWARPS_Q6_K_RDNA1;
5292
- } else if (compute_capability >= CC_TURING) {
5367
+ } else if (compute_capability >= CC_VOLTA) {
5293
5368
  mmq_x = MMQ_X_Q6_K_AMPERE;
5294
5369
  mmq_y = MMQ_Y_Q6_K_AMPERE;
5295
5370
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -5361,31 +5436,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
5361
5436
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5362
5437
  }
5363
5438
 
5364
- static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5365
- const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5439
+ template<typename T>
5440
+ static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5441
+ const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5366
5442
  GGML_ASSERT(ncols % 2 == 0);
5367
5443
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5368
5444
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5369
5445
  const dim3 block_nums(nrows, num_blocks_x, 1);
5370
- rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
5446
+ if (pos == nullptr) {
5447
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5448
+ } else {
5449
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5450
+ }
5371
5451
  }
5372
5452
 
5373
- static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5374
- const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5453
+ template<typename T>
5454
+ static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5455
+ const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5375
5456
  GGML_ASSERT(ncols % 2 == 0);
5376
5457
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5377
5458
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5378
5459
  const dim3 block_nums(nrows, num_blocks_x, 1);
5379
- rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
5460
+ if (pos == nullptr) {
5461
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5462
+ } else {
5463
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5464
+ }
5380
5465
  }
5381
5466
 
5382
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5383
- const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5467
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5468
+ const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5384
5469
  GGML_ASSERT(ncols % 4 == 0);
5385
5470
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
5386
5471
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
5387
5472
  const dim3 block_nums(num_blocks_x, nrows, 1);
5388
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
5473
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
5389
5474
  }
5390
5475
 
5391
5476
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5857,7 +5942,7 @@ static int64_t get_row_rounding(ggml_type type) {
5857
5942
  switch(type) {
5858
5943
  case GGML_TYPE_Q4_0:
5859
5944
  case GGML_TYPE_Q4_1:
5860
- return max_compute_capability >= CC_TURING ? 128 : 64;
5945
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
5861
5946
  case GGML_TYPE_Q5_0:
5862
5947
  case GGML_TYPE_Q5_1:
5863
5948
  case GGML_TYPE_Q8_0:
@@ -5868,7 +5953,7 @@ static int64_t get_row_rounding(ggml_type type) {
5868
5953
  case GGML_TYPE_Q3_K:
5869
5954
  case GGML_TYPE_Q4_K:
5870
5955
  case GGML_TYPE_Q5_K:
5871
- return max_compute_capability >= CC_TURING ? 128 : 64;
5956
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
5872
5957
  case GGML_TYPE_Q6_K:
5873
5958
  return 64;
5874
5959
  default:
@@ -6016,8 +6101,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
6016
6101
  GGML_ASSERT(src1_ddf_i != nullptr);
6017
6102
  GGML_ASSERT(dst_dd_i != nullptr);
6018
6103
 
6019
- const float alpha = 1.0f;
6020
- const float beta = 0.0f;
6021
6104
 
6022
6105
  const int64_t ne00 = src0->ne[0];
6023
6106
 
@@ -6026,16 +6109,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
6026
6109
  const int64_t ne0 = dst->ne[0];
6027
6110
  const int64_t row_diff = row_high - row_low;
6028
6111
 
6029
- float * src0_ddq_as_f32;
6030
- size_t src0_as = 0;
6031
-
6032
- if (src0->type != GGML_TYPE_F32) {
6033
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6034
- src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6035
- to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6036
- }
6037
- const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
6038
-
6039
6112
  int id;
6040
6113
  CUDA_CHECK(cudaGetDevice(&id));
6041
6114
 
@@ -6043,16 +6116,87 @@ inline void ggml_cuda_op_mul_mat_cublas(
6043
6116
  // ldc == nrows of the matrix that cuBLAS writes into
6044
6117
  int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
6045
6118
 
6046
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
6047
- CUBLAS_CHECK(
6048
- cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6049
- row_diff, src1_ncols, ne10,
6050
- &alpha, src0_ddf_i, ne00,
6051
- src1_ddf_i, ne10,
6052
- &beta, dst_dd_i, ldc));
6119
+ const int compute_capability = g_compute_capabilities[id];
6120
+
6121
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
6122
+ // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
6123
+ half * src0_as_f16 = nullptr;
6124
+ size_t src0_as = 0;
6125
+ if (src0->type != GGML_TYPE_F16) {
6126
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
6127
+ GGML_ASSERT(to_fp16_cuda != nullptr);
6128
+ size_t ne = row_diff*ne00;
6129
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
6130
+ to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
6131
+ }
6132
+ const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
6133
+
6134
+ half * src1_as_f16 = nullptr;
6135
+ size_t src1_as = 0;
6136
+ if (src1->type != GGML_TYPE_F16) {
6137
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
6138
+ GGML_ASSERT(to_fp16_cuda != nullptr);
6139
+ size_t ne = src1_ncols*ne10;
6140
+ src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
6141
+ to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
6142
+ }
6143
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
6144
+
6145
+ size_t dst_as = 0;
6146
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
6147
+
6148
+ const half alpha_f16 = 1.0f;
6149
+ const half beta_f16 = 0.0f;
6150
+
6151
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
6152
+ CUBLAS_CHECK(
6153
+ cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6154
+ row_diff, src1_ncols, ne10,
6155
+ &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
6156
+ src1_ptr, CUDA_R_16F, ne10,
6157
+ &beta_f16, dst_f16, CUDA_R_16F, ldc,
6158
+ CUBLAS_COMPUTE_16F,
6159
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
6160
+
6161
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
6162
+ to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
6163
+
6164
+ ggml_cuda_pool_free(dst_f16, dst_as);
6165
+
6166
+ if (src0_as != 0) {
6167
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
6168
+ }
6053
6169
 
6054
- if (src0_as > 0) {
6055
- ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6170
+ if (src1_as != 0) {
6171
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
6172
+ }
6173
+ }
6174
+ else {
6175
+ float * src0_ddq_as_f32 = nullptr;
6176
+ size_t src0_as = 0;
6177
+
6178
+ if (src0->type != GGML_TYPE_F32) {
6179
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6180
+ GGML_ASSERT(to_fp32_cuda != nullptr);
6181
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6182
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6183
+ }
6184
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
6185
+
6186
+ const float alpha = 1.0f;
6187
+ const float beta = 0.0f;
6188
+
6189
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
6190
+ CUBLAS_CHECK(
6191
+ cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6192
+ row_diff, src1_ncols, ne10,
6193
+ &alpha, src0_ddf_i, ne00,
6194
+ src1_ddf_i, ne10,
6195
+ &beta, dst_dd_i, ldc));
6196
+
6197
+ if (src0_as != 0) {
6198
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6199
+ }
6056
6200
  }
6057
6201
 
6058
6202
  (void) dst;
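
Note: this hunk introduces an fp16 GEMM fast path: on devices with compute capability >= CC_VOLTA, when src0 is f16 or quantized and contiguous, both operands are converted to f16 and multiplied with cublasGemmEx using an f16 compute type (eligible for tensor cores), and the f16 result is widened back to f32; everything else falls through to the original cublasSgemm code, now in the else branch. A condensed sketch of the fast path with the pool allocation, error checking and fallback omitted (names as in the diff):

    static void mul_mat_f16_example(cublasHandle_t handle, cudaStream_t stream,
                                    const half * src0_f16, const half * src1_f16,
                                    half * dst_f16, float * dst_f32,
                                    const int row_diff, const int src1_ncols,
                                    const int ne00, const int ne10, const int ldc) {
        const half alpha = 1.0f;
        const half beta  = 0.0f;
        cublasSetStream(handle, stream);
        cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
                     &alpha, src0_f16, CUDA_R_16F, ne00,
                             src1_f16, CUDA_R_16F, ne10,
                     &beta,  dst_f16,  CUDA_R_16F, ldc,
                     CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
        // accumulation happens in half precision; widen the result back to f32 for ggml
        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16, dst_f32, row_diff*src1_ncols, stream);
    }
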
@@ -6064,14 +6208,16 @@ inline void ggml_cuda_op_rope(
6064
6208
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6065
6209
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6066
6210
 
6067
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
6068
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
6211
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
6212
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
6213
+ GGML_ASSERT(src0->type == dst->type);
6069
6214
 
6070
6215
  const int64_t ne00 = src0->ne[0];
6071
6216
  const int64_t ne01 = src0->ne[1];
6217
+ const int64_t ne2 = dst->ne[2];
6072
6218
  const int64_t nrows = ggml_nrows(src0);
6073
6219
 
6074
- const int n_past = ((int32_t *) dst->op_params)[0];
6220
+ //const int n_past = ((int32_t *) dst->op_params)[0];
6075
6221
  const int n_dims = ((int32_t *) dst->op_params)[1];
6076
6222
  const int mode = ((int32_t *) dst->op_params)[2];
6077
6223
  const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -6082,19 +6228,38 @@ inline void ggml_cuda_op_rope(
6082
6228
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
6083
6229
 
6084
6230
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
6085
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
6231
+
6232
+ const int32_t * pos = nullptr;
6233
+ if ((mode & 1) == 0) {
6234
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
6235
+ GGML_ASSERT(src1->ne[0] == ne2);
6236
+ pos = (const int32_t *) src1_dd;
6237
+ }
6086
6238
 
6087
6239
  const bool is_neox = mode & 2;
6088
6240
  const bool is_glm = mode & 4;
6089
6241
 
6090
6242
  // compute
6091
6243
  if (is_glm) {
6092
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
6244
+ GGML_ASSERT(false);
6245
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
6093
6246
  } else if (is_neox) {
6094
6247
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
6095
- rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
6248
+ if (src0->type == GGML_TYPE_F32) {
6249
+ rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6250
+ } else if (src0->type == GGML_TYPE_F16) {
6251
+ rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6252
+ } else {
6253
+ GGML_ASSERT(false);
6254
+ }
6096
6255
  } else {
6097
- rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
6256
+ if (src0->type == GGML_TYPE_F32) {
6257
+ rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6258
+ } else if (src0->type == GGML_TYPE_F16) {
6259
+ rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6260
+ } else {
6261
+ GGML_ASSERT(false);
6262
+ }
6098
6263
  }
6099
6264
 
6100
6265
  (void) src1;
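
Note: ggml_cuda_op_rope no longer derives a single starting angle from the n_past op parameter; whenever (mode & 1) == 0 it instead reads per-token positions from src1, which must be an I32 tensor with dst->ne[2] entries, and forwards them to the kernels as the pos array. The GLM branch keeps its old launcher but is now guarded with GGML_ASSERT(false) and marked FIXME. A hedged sketch of what the positions input is expected to hold, e.g. for a contiguous batch continuing an existing context (illustrative helper, not part of ggml):

    static void fill_positions_example(int32_t * pos, const int n_tokens, const int n_past) {
        for (int i = 0; i < n_tokens; ++i) {
            pos[i] = n_past + i;   // absolute position of token i within the sequence
        }
    }
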
@@ -6265,7 +6430,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
6265
6430
  }
6266
6431
  }
6267
6432
 
6268
- void ggml_cuda_set_peer_access(const int n_tokens) {
6433
+ static void ggml_cuda_set_peer_access(const int n_tokens) {
6269
6434
  static bool peer_access_enabled = false;
6270
6435
 
6271
6436
  const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
@@ -6593,27 +6758,27 @@ static void ggml_cuda_op_mul_mat(
6593
6758
  }
6594
6759
  }
6595
6760
 
6596
- void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6761
+ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6597
6762
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6598
6763
  }
6599
6764
 
6600
- void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6765
+ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6601
6766
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
6602
6767
  }
6603
6768
 
6604
- void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6769
+ static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6605
6770
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
6606
6771
  }
6607
6772
 
6608
- void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6773
+ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6609
6774
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
6610
6775
  }
6611
6776
 
6612
- void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6777
+ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6613
6778
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
6614
6779
  }
6615
6780
 
6616
- void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6781
+ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6617
6782
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
6618
6783
  }
6619
6784
 
@@ -6624,17 +6789,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
6624
6789
  const int64_t ne1 = dst->ne[1];
6625
6790
 
6626
6791
  // TODO: find the optimal values for these
6627
- if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
6628
- src1->type == GGML_TYPE_F32 &&
6629
- dst->type == GGML_TYPE_F32 &&
6630
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
6631
- return true;
6632
- }
6633
-
6634
- return false;
6792
+ return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
6793
+ src1->type == GGML_TYPE_F32 &&
6794
+ dst->type == GGML_TYPE_F32 &&
6795
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
6635
6796
  }
6636
6797
 
6637
- void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
6798
+ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
6638
6799
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
6639
6800
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
6640
6801
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -6663,7 +6824,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6663
6824
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
6664
6825
  }
6665
6826
 
6666
- void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
6827
+ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
6667
6828
  GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
6668
6829
  GGML_ASSERT(!ggml_is_permuted(src0));
6669
6830
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
@@ -6697,7 +6858,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
6697
6858
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
6698
6859
  }
6699
6860
 
6700
- void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6861
+ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6701
6862
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
6702
6863
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
6703
6864
 
@@ -6741,11 +6902,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
6741
6902
  }
6742
6903
  }
6743
6904
 
6744
- void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6905
+ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6745
6906
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6746
6907
  }
6747
6908
 
6748
- void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6909
+ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6749
6910
  const int64_t ne = ggml_nelements(src0);
6750
6911
  GGML_ASSERT(ne == ggml_nelements(src1));
6751
6912
 
@@ -6787,35 +6948,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6787
6948
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
6788
6949
  ne10, ne11, nb10, nb11, nb12, main_stream);
6789
6950
  } else {
6951
+ fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
6952
+ ggml_type_name(src0->type), ggml_type_name(src1->type));
6790
6953
  GGML_ASSERT(false);
6791
6954
  }
6792
6955
 
6793
6956
  (void) dst;
6794
6957
  }
6795
6958
 
6796
- void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6959
+ static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6797
6960
  ggml_cuda_cpy(src0, dst, nullptr);
6798
6961
  (void) src1;
6799
6962
  }
6800
6963
 
6801
- void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6964
+ static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6802
6965
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
6803
6966
  }
6804
6967
 
6805
- void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6968
+ static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6806
6969
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
6807
6970
  }
6808
6971
 
6809
- void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6972
+ static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6810
6973
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
6811
6974
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
6812
6975
  }
6813
6976
 
6814
- void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6977
+ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6815
6978
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
6816
6979
  }
6817
6980
 
6818
- void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6981
+ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6819
6982
  (void) src0;
6820
6983
  (void) src1;
6821
6984
  (void) dst;
@@ -6938,11 +7101,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
6938
7101
  return extra;
6939
7102
  }
6940
7103
 
6941
- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
7104
+ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
6942
7105
  if (scratch && g_scratch_size == 0) {
6943
7106
  return;
6944
7107
  }
6945
7108
 
7109
+ tensor->backend = GGML_BACKEND_GPU;
7110
+
6946
7111
  // recursively assign CUDA buffers until a compute tensor is found
6947
7112
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
6948
7113
  const ggml_op src0_op = tensor->src[0]->op;
@@ -6954,8 +7119,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
6954
7119
  ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
6955
7120
  }
6956
7121
 
6957
- tensor->backend = GGML_BACKEND_GPU;
6958
-
6959
7122
  if (scratch && no_alloc) {
6960
7123
  return;
6961
7124
  }
@@ -7040,6 +7203,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
7040
7203
  tensor->extra = extra;
7041
7204
  }
7042
7205
 
7206
+ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
7207
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7208
+ GGML_ASSERT(ggml_is_contiguous(tensor));
7209
+
7210
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7211
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7212
+ CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7213
+ }
7214
+
7043
7215
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
7044
7216
  ggml_cuda_assign_buffers_impl(tensor, true, false, false);
7045
7217
  }
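
Note: ggml_cuda_copy_to_device is a new public helper that synchronously copies a contiguous, GPU-backed tensor's host data (tensor->data) into its device buffer on the main device. A hedged usage sketch (only the asserted preconditions come from the diff; the wrapper itself is an assumption):

    static void upload_example(struct ggml_tensor * t) {
        // preconditions checked by the helper itself: GPU backend and contiguous layout
        GGML_ASSERT(t->backend == GGML_BACKEND_GPU);
        GGML_ASSERT(ggml_is_contiguous(t));
        ggml_cuda_copy_to_device(t);   // blocking cudaMemcpy of ggml_nbytes(t) bytes, host -> device
    }
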
@@ -7075,7 +7247,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
7075
7247
  }
7076
7248
 
7077
7249
  void ggml_cuda_set_scratch_size(const size_t scratch_size) {
7078
- g_scratch_size = scratch_size;
7250
+ // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
7251
+ // it still won't always work as expected, but it's better than nothing
7252
+ if (scratch_size > g_scratch_size) {
7253
+ ggml_cuda_free_scratch();
7254
+ }
7255
+ g_scratch_size = std::max(g_scratch_size, scratch_size);
7079
7256
  }
7080
7257
 
7081
7258
  void ggml_cuda_free_scratch() {
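
Note: together with the earlier change of the default scratch size from 1 GB to 0 (disabled), ggml_cuda_set_scratch_size now only grows the target: a larger request frees the current scratch buffer so it can be lazily reallocated at the new size, while a smaller request is ignored. The hunk itself describes this as a partial workaround for using several models or contexts at once. A hedged illustration of the caller-visible behaviour (byte counts are arbitrary examples, not recommendations):

    static void configure_scratch_example() {
        ggml_cuda_set_scratch_size(512ull*1024*1024);   // grows the scratch target to 512 MiB
        ggml_cuda_set_scratch_size(256ull*1024*1024);   // no effect: smaller than the current target
    }
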