llama_cpp 0.5.3 → 0.7.0

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -1,3 +1,4 @@
1
+ #include <algorithm>
1
2
  #include <cstddef>
2
3
  #include <cstdint>
3
4
  #include <limits>
@@ -14,9 +15,11 @@
14
15
  // for rocblas_initialize()
15
16
  #include "rocblas/rocblas.h"
16
17
  #endif // __HIP_PLATFORM_AMD__
18
+ #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
17
19
  #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
18
20
  #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
19
21
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
22
+ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
20
23
  #define CUBLAS_OP_N HIPBLAS_OP_N
21
24
  #define CUBLAS_OP_T HIPBLAS_OP_T
22
25
  #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
@@ -77,9 +80,9 @@
77
80
  #include "ggml.h"
78
81
 
79
82
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
80
- #define CC_TURING 700
83
+ #define CC_VOLTA 700
81
84
  #define CC_OFFSET_AMD 1000000
82
- #define CC_RDNA2 CC_OFFSET_AMD + 1030
85
+ #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
83
86
 
84
87
  #if defined(GGML_USE_HIPBLAS)
85
88
  #define __CUDA_ARCH__ 1300
@@ -235,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
235
238
  return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
236
239
  }
237
240
 
241
+ template<typename T>
242
+ using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
243
+ typedef to_t_cuda_t<float> to_fp32_cuda_t;
244
+ typedef to_t_cuda_t<half> to_fp16_cuda_t;
245
+
238
246
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
239
- typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
240
247
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
241
248
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
242
249
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
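
The template alias added in this hunk gives the fp32 and fp16 conversion entry points one shared function-pointer shape, so the same templated dequantizers introduced later in the diff can be bound to either to_fp32_cuda_t or to_fp16_cuda_t. A minimal sketch of that binding, assuming CUDA's half type is available (the dequantizer name below is illustrative, not from the diff):

    template<typename T>
    using to_t_cuda_t = void (*)(const void * x, T * y, int k, cudaStream_t stream);
    typedef to_t_cuda_t<float> to_fp32_cuda_t;
    typedef to_t_cuda_t<half>  to_fp16_cuda_t;

    // a templated dequantizer with the matching signature (name is hypothetical)
    template<typename dst_t>
    static void dequantize_row_x_cuda(const void * vx, dst_t * y, int k, cudaStream_t stream);

    to_fp32_cuda_t to_f32 = dequantize_row_x_cuda<float>; // float-output instantiation
    to_fp16_cuda_t to_f16 = dequantize_row_x_cuda<half>;  // half-output instantiation
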
@@ -461,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
461
468
  static bool g_mul_mat_q = true;
462
469
 
463
470
  static void * g_scratch_buffer = nullptr;
464
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
471
+ static size_t g_scratch_size = 0; // disabled by default
465
472
  static size_t g_scratch_offset = 0;
466
473
 
467
474
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -708,7 +715,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
708
715
 
709
716
  //================================== k-quants
710
717
 
711
- static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
718
+ template<typename dst_t>
719
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
712
720
 
713
721
  const int i = blockIdx.x;
714
722
  const block_q2_K * x = (const block_q2_K *) vx;
@@ -720,7 +728,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
720
728
  const int is = 8*n + l/16;
721
729
 
722
730
  const uint8_t q = x[i].qs[32*n + l];
723
- float * y = yy + i*QK_K + 128*n;
731
+ dst_t * y = yy + i*QK_K + 128*n;
724
732
 
725
733
  float dall = __low2half(x[i].dm);
726
734
  float dmin = __high2half(x[i].dm);
@@ -732,7 +740,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
732
740
  const int is = tid/16; // 0 or 1
733
741
  const int il = tid%16; // 0...15
734
742
  const uint8_t q = x[i].qs[il] >> (2*is);
735
- float * y = yy + i*QK_K + 16*is + il;
743
+ dst_t * y = yy + i*QK_K + 16*is + il;
736
744
  float dall = __low2half(x[i].dm);
737
745
  float dmin = __high2half(x[i].dm);
738
746
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
@@ -741,7 +749,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
741
749
 
742
750
  }
743
751
 
744
- static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
752
+ template<typename dst_t>
753
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
745
754
 
746
755
  const int i = blockIdx.x;
747
756
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -765,7 +774,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
765
774
  float d_all = x[i].d;
766
775
  float dl = d_all * (us - 32);
767
776
 
768
- float * y = yy + i*QK_K + 128*n + 32*j;
777
+ dst_t * y = yy + i*QK_K + 128*n + 32*j;
769
778
  const uint8_t * q = x[i].qs + 32*n;
770
779
  const uint8_t * hm = x[i].hmask;
771
780
 
@@ -777,7 +786,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
777
786
  const int im = il/8; // 0...1
778
787
  const int in = il%8; // 0...7
779
788
 
780
- float * y = yy + i*QK_K + 16*is + il;
789
+ dst_t * y = yy + i*QK_K + 16*is + il;
781
790
 
782
791
  const uint8_t q = x[i].qs[il] >> (2*is);
783
792
  const uint8_t h = x[i].hmask[in] >> (2*is + im);
@@ -805,7 +814,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
805
814
  }
806
815
  #endif
807
816
 
808
- static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
817
+ template<typename dst_t>
818
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
809
819
  const block_q4_K * x = (const block_q4_K *) vx;
810
820
 
811
821
  const int i = blockIdx.x;
@@ -818,7 +828,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
818
828
  const int is = 2*il;
819
829
  const int n = 4;
820
830
 
821
- float * y = yy + i*QK_K + 64*il + n*ir;
831
+ dst_t * y = yy + i*QK_K + 64*il + n*ir;
822
832
 
823
833
  const float dall = __low2half(x[i].dm);
824
834
  const float dmin = __high2half(x[i].dm);
@@ -837,7 +847,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
837
847
  #else
838
848
  const int tid = threadIdx.x;
839
849
  const uint8_t * q = x[i].qs;
840
- float * y = yy + i*QK_K;
850
+ dst_t * y = yy + i*QK_K;
841
851
  const float d = (float)x[i].dm[0];
842
852
  const float m = (float)x[i].dm[1];
843
853
  y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
@@ -845,7 +855,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
845
855
  #endif
846
856
  }
847
857
 
848
- static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
858
+ template<typename dst_t>
859
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
849
860
  const block_q5_K * x = (const block_q5_K *) vx;
850
861
 
851
862
  const int i = blockIdx.x;
@@ -857,7 +868,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
857
868
  const int ir = tid%16; // ir is in 0...15
858
869
  const int is = 2*il; // is is in 0...6
859
870
 
860
- float * y = yy + i*QK_K + 64*il + 2*ir;
871
+ dst_t * y = yy + i*QK_K + 64*il + 2*ir;
861
872
 
862
873
  const float dall = __low2half(x[i].dm);
863
874
  const float dmin = __high2half(x[i].dm);
@@ -885,13 +896,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
885
896
  const int is = tid/16; // 0 or 1
886
897
  const uint8_t h = x[i].qh[in] >> im;
887
898
  const float d = x[i].d;
888
- float * y = yy + i*QK_K + tid;
899
+ dst_t * y = yy + i*QK_K + tid;
889
900
  y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
890
901
  y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
891
902
  #endif
892
903
  }
893
904
 
894
- static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
905
+ template<typename dst_t>
906
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
895
907
  const block_q6_K * x = (const block_q6_K *) vx;
896
908
 
897
909
  const int i = blockIdx.x;
@@ -903,7 +915,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
903
915
  const int il = tid - 32*ip; // 0...32
904
916
  const int is = 8*ip + il/16;
905
917
 
906
- float * y = yy + i*QK_K + 128*ip + il;
918
+ dst_t * y = yy + i*QK_K + 128*ip + il;
907
919
 
908
920
  const float d = x[i].d;
909
921
 
@@ -922,7 +934,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
922
934
  const int ip = tid/16; // 0 or 1
923
935
  const int il = tid - 16*ip; // 0...15
924
936
 
925
- float * y = yy + i*QK_K + 16*ip + il;
937
+ dst_t * y = yy + i*QK_K + 16*ip + il;
926
938
 
927
939
  const float d = x[i].d;
928
940
 
@@ -1515,6 +1527,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1515
1527
  v.y = x[ib + iqs + 1];
1516
1528
  }
1517
1529
 
1530
+ static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
1531
+ const float * x = (const float *) vx;
1532
+
1533
+ // automatic half -> float type cast if dfloat == float
1534
+ v.x = x[ib + iqs + 0];
1535
+ v.y = x[ib + iqs + 1];
1536
+ }
1537
+
1518
1538
  static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1519
1539
  const int ix = blockDim.x*blockIdx.x + threadIdx.x;
1520
1540
 
@@ -1554,8 +1574,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1554
1574
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1555
1575
  }
1556
1576
 
1557
- template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1558
- static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
1577
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1578
+ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1559
1579
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
1560
1580
 
1561
1581
  if (i >= k) {
@@ -3533,7 +3553,7 @@ template <bool need_check> static __global__ void
3533
3553
  load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3534
3554
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3535
3555
 
3536
- #elif __CUDA_ARCH__ >= CC_TURING
3556
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3537
3557
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
3538
3558
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3539
3559
  const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3553,7 +3573,7 @@ template <bool need_check> static __global__ void
3553
3573
  #else
3554
3574
  (void) vec_dot_q4_0_q8_1_mul_mat;
3555
3575
  assert(false);
3556
- #endif // __CUDA_ARCH__ >= CC_TURING
3576
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3557
3577
  }
3558
3578
 
3559
3579
  #define MMQ_X_Q4_1_RDNA2 64
@@ -3574,9 +3594,9 @@ template <bool need_check> static __global__ void
3574
3594
  #if defined(RDNA3) || defined(RDNA2)
3575
3595
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
3576
3596
  #endif // defined(RDNA3) || defined(RDNA2)
3577
- #elif __CUDA_ARCH__ < CC_TURING
3597
+ #elif __CUDA_ARCH__ < CC_VOLTA
3578
3598
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3579
- #endif // __CUDA_ARCH__ < CC_TURING
3599
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3580
3600
  mul_mat_q4_1(
3581
3601
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3582
3602
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3596,7 +3616,7 @@ template <bool need_check> static __global__ void
3596
3616
  load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3597
3617
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3598
3618
 
3599
- #elif __CUDA_ARCH__ >= CC_TURING
3619
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3600
3620
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
3601
3621
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3602
3622
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3616,7 +3636,7 @@ template <bool need_check> static __global__ void
3616
3636
  #else
3617
3637
  (void) vec_dot_q4_1_q8_1_mul_mat;
3618
3638
  assert(false);
3619
- #endif // __CUDA_ARCH__ >= CC_TURING
3639
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3620
3640
  }
3621
3641
 
3622
3642
  #define MMQ_X_Q5_0_RDNA2 64
@@ -3657,7 +3677,7 @@ template <bool need_check> static __global__ void
3657
3677
  load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3658
3678
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3659
3679
 
3660
- #elif __CUDA_ARCH__ >= CC_TURING
3680
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3661
3681
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
3662
3682
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3663
3683
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3677,7 +3697,7 @@ template <bool need_check> static __global__ void
3677
3697
  #else
3678
3698
  (void) vec_dot_q5_0_q8_1_mul_mat;
3679
3699
  assert(false);
3680
- #endif // __CUDA_ARCH__ >= CC_TURING
3700
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3681
3701
  }
3682
3702
 
3683
3703
  #define MMQ_X_Q5_1_RDNA2 64
@@ -3718,7 +3738,7 @@ mul_mat_q5_1(
3718
3738
  load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3719
3739
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3720
3740
 
3721
- #elif __CUDA_ARCH__ >= CC_TURING
3741
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3722
3742
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
3723
3743
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3724
3744
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3738,7 +3758,7 @@ mul_mat_q5_1(
3738
3758
  #else
3739
3759
  (void) vec_dot_q5_1_q8_1_mul_mat;
3740
3760
  assert(false);
3741
- #endif // __CUDA_ARCH__ >= CC_TURING
3761
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3742
3762
  }
3743
3763
 
3744
3764
  #define MMQ_X_Q8_0_RDNA2 64
@@ -3779,7 +3799,7 @@ template <bool need_check> static __global__ void
3779
3799
  load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3780
3800
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3781
3801
 
3782
- #elif __CUDA_ARCH__ >= CC_TURING
3802
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3783
3803
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
3784
3804
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3785
3805
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3799,7 +3819,7 @@ template <bool need_check> static __global__ void
3799
3819
  #else
3800
3820
  (void) vec_dot_q8_0_q8_1_mul_mat;
3801
3821
  assert(false);
3802
- #endif // __CUDA_ARCH__ >= CC_TURING
3822
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3803
3823
  }
3804
3824
 
3805
3825
  #define MMQ_X_Q2_K_RDNA2 64
@@ -3840,7 +3860,7 @@ mul_mat_q2_K(
3840
3860
  load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3841
3861
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3842
3862
 
3843
- #elif __CUDA_ARCH__ >= CC_TURING
3863
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3844
3864
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
3845
3865
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3846
3866
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3860,7 +3880,7 @@ mul_mat_q2_K(
3860
3880
  #else
3861
3881
  (void) vec_dot_q2_K_q8_1_mul_mat;
3862
3882
  assert(false);
3863
- #endif // __CUDA_ARCH__ >= CC_TURING
3883
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3864
3884
  }
3865
3885
 
3866
3886
  #define MMQ_X_Q3_K_RDNA2 128
@@ -3881,9 +3901,9 @@ template <bool need_check> static __global__ void
3881
3901
  #if defined(RDNA3) || defined(RDNA2)
3882
3902
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
3883
3903
  #endif // defined(RDNA3) || defined(RDNA2)
3884
- #elif __CUDA_ARCH__ < CC_TURING
3904
+ #elif __CUDA_ARCH__ < CC_VOLTA
3885
3905
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3886
- #endif // __CUDA_ARCH__ < CC_TURING
3906
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3887
3907
  mul_mat_q3_K(
3888
3908
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3889
3909
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3903,7 +3923,7 @@ template <bool need_check> static __global__ void
3903
3923
  load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3904
3924
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3905
3925
 
3906
- #elif __CUDA_ARCH__ >= CC_TURING
3926
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3907
3927
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
3908
3928
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3909
3929
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3923,7 +3943,7 @@ template <bool need_check> static __global__ void
3923
3943
  #else
3924
3944
  (void) vec_dot_q3_K_q8_1_mul_mat;
3925
3945
  assert(false);
3926
- #endif // __CUDA_ARCH__ >= CC_TURING
3946
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3927
3947
  }
3928
3948
 
3929
3949
  #define MMQ_X_Q4_K_RDNA2 64
@@ -3944,9 +3964,9 @@ template <bool need_check> static __global__ void
3944
3964
  #if defined(RDNA3) || defined(RDNA2)
3945
3965
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
3946
3966
  #endif // defined(RDNA3) || defined(RDNA2)
3947
- #elif __CUDA_ARCH__ < CC_TURING
3967
+ #elif __CUDA_ARCH__ < CC_VOLTA
3948
3968
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3949
- #endif // __CUDA_ARCH__ < CC_TURING
3969
+ #endif // __CUDA_ARCH__ < CC_VOLTA
3950
3970
  mul_mat_q4_K(
3951
3971
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3952
3972
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3966,7 +3986,7 @@ template <bool need_check> static __global__ void
3966
3986
  load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3967
3987
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3968
3988
 
3969
- #elif __CUDA_ARCH__ >= CC_TURING
3989
+ #elif __CUDA_ARCH__ >= CC_VOLTA
3970
3990
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
3971
3991
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3972
3992
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3986,7 +4006,7 @@ template <bool need_check> static __global__ void
3986
4006
  #else
3987
4007
  (void) vec_dot_q4_K_q8_1_mul_mat;
3988
4008
  assert(false);
3989
- #endif // __CUDA_ARCH__ >= CC_TURING
4009
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
3990
4010
  }
3991
4011
 
3992
4012
  #define MMQ_X_Q5_K_RDNA2 64
@@ -4027,7 +4047,7 @@ mul_mat_q5_K(
4027
4047
  load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4028
4048
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4029
4049
 
4030
- #elif __CUDA_ARCH__ >= CC_TURING
4050
+ #elif __CUDA_ARCH__ >= CC_VOLTA
4031
4051
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
4032
4052
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
4033
4053
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4047,7 +4067,7 @@ mul_mat_q5_K(
4047
4067
  #else
4048
4068
  (void) vec_dot_q5_K_q8_1_mul_mat;
4049
4069
  assert(false);
4050
- #endif // __CUDA_ARCH__ >= CC_TURING
4070
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4051
4071
  }
4052
4072
 
4053
4073
  #define MMQ_X_Q6_K_RDNA2 64
@@ -4068,9 +4088,9 @@ template <bool need_check> static __global__ void
4068
4088
  #if defined(RDNA3) || defined(RDNA2)
4069
4089
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
4070
4090
  #endif // defined(RDNA3) || defined(RDNA2)
4071
- #elif __CUDA_ARCH__ < CC_TURING
4091
+ #elif __CUDA_ARCH__ < CC_VOLTA
4072
4092
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
4073
- #endif // __CUDA_ARCH__ < CC_TURING
4093
+ #endif // __CUDA_ARCH__ < CC_VOLTA
4074
4094
  mul_mat_q6_K(
4075
4095
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
4076
4096
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4090,7 +4110,7 @@ template <bool need_check> static __global__ void
4090
4110
  load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4091
4111
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4092
4112
 
4093
- #elif __CUDA_ARCH__ >= CC_TURING
4113
+ #elif __CUDA_ARCH__ >= CC_VOLTA
4094
4114
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
4095
4115
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
4096
4116
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4110,7 +4130,7 @@ template <bool need_check> static __global__ void
4110
4130
  #else
4111
4131
  (void) vec_dot_q6_K_q8_1_mul_mat;
4112
4132
  assert(false);
4113
- #endif // __CUDA_ARCH__ >= CC_TURING
4133
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
4114
4134
  }
4115
4135
 
4116
4136
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -4355,8 +4375,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
4355
4375
  }
4356
4376
 
4357
4377
  // rope == RoPE == rotary positional embedding
4358
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
4359
- const float p_delta, const int p_delta_rows, const float theta_scale) {
4378
+
4379
+ template<typename T, bool has_pos>
4380
+ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4381
+ const int p_delta_rows, const float theta_scale) {
4360
4382
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4361
4383
 
4362
4384
  if (col >= ncols) {
@@ -4365,8 +4387,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
4365
4387
 
4366
4388
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
4367
4389
  const int i = row*ncols + col;
4390
+ const int i2 = row/p_delta_rows;
4368
4391
 
4369
- const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
4392
+ const int p = has_pos ? pos[i2] : 0;
4393
+ const float p0 = p*freq_scale;
4394
+ const float theta = p0*powf(theta_scale, col/2);
4370
4395
  const float sin_theta = sinf(theta);
4371
4396
  const float cos_theta = cosf(theta);
4372
4397
 
@@ -4377,8 +4402,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
4377
4402
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
4378
4403
  }
4379
4404
 
4380
- static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
4381
- const float p_delta, const int p_delta_rows, const float theta_scale) {
4405
+ template<typename T, bool has_pos>
4406
+ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4407
+ const int p_delta_rows, const float theta_scale) {
4382
4408
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4383
4409
 
4384
4410
  if (col >= ncols) {
@@ -4387,8 +4413,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
4387
4413
 
4388
4414
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
4389
4415
  const int i = row*ncols + col/2;
4416
+ const int i2 = row/p_delta_rows;
4390
4417
 
4391
- const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
4418
+ const int p = has_pos ? pos[i2] : 0;
4419
+ const float p0 = p*freq_scale;
4420
+ const float theta = p0*powf(theta_scale, col/2);
4392
4421
  const float sin_theta = sinf(theta);
4393
4422
  const float cos_theta = cosf(theta);
4394
4423
 
@@ -4399,8 +4428,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
4399
4428
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4400
4429
  }
4401
4430
 
4402
- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
4403
- const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
4431
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
4432
+ const int p_delta_rows, const float theta_scale, const int n_ctx) {
4404
4433
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4405
4434
  const int half_n_dims = ncols/4;
4406
4435
 
@@ -4410,11 +4439,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4410
4439
 
4411
4440
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
4412
4441
  const int i = row*ncols + col;
4442
+ const int i2 = row/p_delta_rows;
4413
4443
 
4414
4444
  const float col_theta_scale = powf(theta_scale, col);
4415
- const float p = p0 + p_delta*(row/p_delta_rows);
4445
+ // FIXME: this is likely wrong
4446
+ const int p = pos != nullptr ? pos[i2] : 0;
4416
4447
 
4417
- const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
4448
+ const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
4418
4449
  const float sin_theta = sinf(theta);
4419
4450
  const float cos_theta = cosf(theta);
4420
4451
 
@@ -4424,7 +4455,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4424
4455
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
4425
4456
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
4426
4457
 
4427
- const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
4458
+ const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
4428
4459
  const float sin_block_theta = sinf(block_theta);
4429
4460
  const float cos_block_theta = cosf(block_theta);
4430
4461
 
@@ -4578,32 +4609,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
4578
4609
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
4579
4610
  }
4580
4611
 
4581
- static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4612
+ template<typename dst_t>
4613
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4582
4614
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4583
4615
  dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4584
4616
  }
4585
4617
 
4586
- static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4618
+ template<typename dst_t>
4619
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4587
4620
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4588
4621
  dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4589
4622
  }
4590
4623
 
4591
- static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4624
+ template<typename dst_t>
4625
+ static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4592
4626
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4593
4627
  dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4594
4628
  }
4595
4629
 
4596
- static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4630
+ template<typename dst_t>
4631
+ static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4597
4632
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4598
4633
  dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4599
4634
  }
4600
4635
 
4601
- static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4636
+ template<typename dst_t>
4637
+ static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4602
4638
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4603
4639
  dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4604
4640
  }
4605
4641
 
4606
- static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4642
+ template<typename dst_t>
4643
+ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4607
4644
  const int nb = k / QK_K;
4608
4645
  #if QK_K == 256
4609
4646
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4612,7 +4649,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
4612
4649
  #endif
4613
4650
  }
4614
4651
 
4615
- static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4652
+ template<typename dst_t>
4653
+ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4616
4654
  const int nb = k / QK_K;
4617
4655
  #if QK_K == 256
4618
4656
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4621,12 +4659,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
4621
4659
  #endif
4622
4660
  }
4623
4661
 
4624
- static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4662
+ template<typename dst_t>
4663
+ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4625
4664
  const int nb = k / QK_K;
4626
4665
  dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
4627
4666
  }
4628
4667
 
4629
- static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4668
+ template<typename dst_t>
4669
+ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4630
4670
  const int nb = k / QK_K;
4631
4671
  #if QK_K == 256
4632
4672
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4635,7 +4675,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu
4635
4675
  #endif
4636
4676
  }
4637
4677
 
4638
- static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
4678
+ template<typename dst_t>
4679
+ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4639
4680
  const int nb = k / QK_K;
4640
4681
  #if QK_K == 256
4641
4682
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4826,6 +4867,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
4826
4867
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4827
4868
  }
4828
4869
 
4870
+ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
4871
+ const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
4872
+ dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4873
+ }
4874
+
4829
4875
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4830
4876
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4831
4877
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -4835,6 +4881,35 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
4835
4881
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4836
4882
  }
4837
4883
 
4884
+ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
4885
+ switch (type) {
4886
+ case GGML_TYPE_Q4_0:
4887
+ return dequantize_row_q4_0_cuda;
4888
+ case GGML_TYPE_Q4_1:
4889
+ return dequantize_row_q4_1_cuda;
4890
+ case GGML_TYPE_Q5_0:
4891
+ return dequantize_row_q5_0_cuda;
4892
+ case GGML_TYPE_Q5_1:
4893
+ return dequantize_row_q5_1_cuda;
4894
+ case GGML_TYPE_Q8_0:
4895
+ return dequantize_row_q8_0_cuda;
4896
+ case GGML_TYPE_Q2_K:
4897
+ return dequantize_row_q2_K_cuda;
4898
+ case GGML_TYPE_Q3_K:
4899
+ return dequantize_row_q3_K_cuda;
4900
+ case GGML_TYPE_Q4_K:
4901
+ return dequantize_row_q4_K_cuda;
4902
+ case GGML_TYPE_Q5_K:
4903
+ return dequantize_row_q5_K_cuda;
4904
+ case GGML_TYPE_Q6_K:
4905
+ return dequantize_row_q6_K_cuda;
4906
+ case GGML_TYPE_F32:
4907
+ return convert_fp32_to_fp16_cuda;
4908
+ default:
4909
+ return nullptr;
4910
+ }
4911
+ }
4912
+
4838
4913
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
4839
4914
  switch (type) {
4840
4915
  case GGML_TYPE_Q4_0:
@@ -4881,7 +4956,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
4881
4956
  mmq_x = MMQ_X_Q4_0_RDNA1;
4882
4957
  mmq_y = MMQ_Y_Q4_0_RDNA1;
4883
4958
  nwarps = NWARPS_Q4_0_RDNA1;
4884
- } else if (compute_capability >= CC_TURING) {
4959
+ } else if (compute_capability >= CC_VOLTA) {
4885
4960
  mmq_x = MMQ_X_Q4_0_AMPERE;
4886
4961
  mmq_y = MMQ_Y_Q4_0_AMPERE;
4887
4962
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4926,7 +5001,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
4926
5001
  mmq_x = MMQ_X_Q4_1_RDNA1;
4927
5002
  mmq_y = MMQ_Y_Q4_1_RDNA1;
4928
5003
  nwarps = NWARPS_Q4_1_RDNA1;
4929
- } else if (compute_capability >= CC_TURING) {
5004
+ } else if (compute_capability >= CC_VOLTA) {
4930
5005
  mmq_x = MMQ_X_Q4_1_AMPERE;
4931
5006
  mmq_y = MMQ_Y_Q4_1_AMPERE;
4932
5007
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -4971,7 +5046,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4971
5046
  mmq_x = MMQ_X_Q5_0_RDNA1;
4972
5047
  mmq_y = MMQ_Y_Q5_0_RDNA1;
4973
5048
  nwarps = NWARPS_Q5_0_RDNA1;
4974
- } else if (compute_capability >= CC_TURING) {
5049
+ } else if (compute_capability >= CC_VOLTA) {
4975
5050
  mmq_x = MMQ_X_Q5_0_AMPERE;
4976
5051
  mmq_y = MMQ_Y_Q5_0_AMPERE;
4977
5052
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -5016,7 +5091,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
5016
5091
  mmq_x = MMQ_X_Q5_1_RDNA1;
5017
5092
  mmq_y = MMQ_Y_Q5_1_RDNA1;
5018
5093
  nwarps = NWARPS_Q5_1_RDNA1;
5019
- } else if (compute_capability >= CC_TURING) {
5094
+ } else if (compute_capability >= CC_VOLTA) {
5020
5095
  mmq_x = MMQ_X_Q5_1_AMPERE;
5021
5096
  mmq_y = MMQ_Y_Q5_1_AMPERE;
5022
5097
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -5061,7 +5136,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
5061
5136
  mmq_x = MMQ_X_Q8_0_RDNA1;
5062
5137
  mmq_y = MMQ_Y_Q8_0_RDNA1;
5063
5138
  nwarps = NWARPS_Q8_0_RDNA1;
5064
- } else if (compute_capability >= CC_TURING) {
5139
+ } else if (compute_capability >= CC_VOLTA) {
5065
5140
  mmq_x = MMQ_X_Q8_0_AMPERE;
5066
5141
  mmq_y = MMQ_Y_Q8_0_AMPERE;
5067
5142
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -5106,7 +5181,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
5106
5181
  mmq_x = MMQ_X_Q2_K_RDNA1;
5107
5182
  mmq_y = MMQ_Y_Q2_K_RDNA1;
5108
5183
  nwarps = NWARPS_Q2_K_RDNA1;
5109
- } else if (compute_capability >= CC_TURING) {
5184
+ } else if (compute_capability >= CC_VOLTA) {
5110
5185
  mmq_x = MMQ_X_Q2_K_AMPERE;
5111
5186
  mmq_y = MMQ_Y_Q2_K_AMPERE;
5112
5187
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -5153,7 +5228,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
5153
5228
  mmq_x = MMQ_X_Q3_K_RDNA1;
5154
5229
  mmq_y = MMQ_Y_Q3_K_RDNA1;
5155
5230
  nwarps = NWARPS_Q3_K_RDNA1;
5156
- } else if (compute_capability >= CC_TURING) {
5231
+ } else if (compute_capability >= CC_VOLTA) {
5157
5232
  mmq_x = MMQ_X_Q3_K_AMPERE;
5158
5233
  mmq_y = MMQ_Y_Q3_K_AMPERE;
5159
5234
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -5199,7 +5274,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
5199
5274
  mmq_x = MMQ_X_Q4_K_RDNA1;
5200
5275
  mmq_y = MMQ_Y_Q4_K_RDNA1;
5201
5276
  nwarps = NWARPS_Q4_K_RDNA1;
5202
- } else if (compute_capability >= CC_TURING) {
5277
+ } else if (compute_capability >= CC_VOLTA) {
5203
5278
  mmq_x = MMQ_X_Q4_K_AMPERE;
5204
5279
  mmq_y = MMQ_Y_Q4_K_AMPERE;
5205
5280
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -5244,7 +5319,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
5244
5319
  mmq_x = MMQ_X_Q5_K_RDNA1;
5245
5320
  mmq_y = MMQ_Y_Q5_K_RDNA1;
5246
5321
  nwarps = NWARPS_Q5_K_RDNA1;
5247
- } else if (compute_capability >= CC_TURING) {
5322
+ } else if (compute_capability >= CC_VOLTA) {
5248
5323
  mmq_x = MMQ_X_Q5_K_AMPERE;
5249
5324
  mmq_y = MMQ_Y_Q5_K_AMPERE;
5250
5325
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -5289,7 +5364,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
5289
5364
  mmq_x = MMQ_X_Q6_K_RDNA1;
5290
5365
  mmq_y = MMQ_Y_Q6_K_RDNA1;
5291
5366
  nwarps = NWARPS_Q6_K_RDNA1;
5292
- } else if (compute_capability >= CC_TURING) {
5367
+ } else if (compute_capability >= CC_VOLTA) {
5293
5368
  mmq_x = MMQ_X_Q6_K_AMPERE;
5294
5369
  mmq_y = MMQ_Y_Q6_K_AMPERE;
5295
5370
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -5361,31 +5436,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
5361
5436
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5362
5437
  }
5363
5438
 
5364
- static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5365
- const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5439
+ template<typename T>
5440
+ static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5441
+ const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5366
5442
  GGML_ASSERT(ncols % 2 == 0);
5367
5443
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5368
5444
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5369
5445
  const dim3 block_nums(nrows, num_blocks_x, 1);
5370
- rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
5446
+ if (pos == nullptr) {
5447
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5448
+ } else {
5449
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5450
+ }
5371
5451
  }
5372
5452
 
5373
- static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5374
- const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5453
+ template<typename T>
5454
+ static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5455
+ const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5375
5456
  GGML_ASSERT(ncols % 2 == 0);
5376
5457
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5377
5458
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5378
5459
  const dim3 block_nums(nrows, num_blocks_x, 1);
5379
- rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
5460
+ if (pos == nullptr) {
5461
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5462
+ } else {
5463
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5464
+ }
5380
5465
  }
5381
5466
 
5382
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5383
- const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5467
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5468
+ const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5384
5469
  GGML_ASSERT(ncols % 4 == 0);
5385
5470
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
5386
5471
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
5387
5472
  const dim3 block_nums(num_blocks_x, nrows, 1);
5388
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
5473
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
5389
5474
  }
5390
5475
 
5391
5476
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5857,7 +5942,7 @@ static int64_t get_row_rounding(ggml_type type) {
5857
5942
  switch(type) {
5858
5943
  case GGML_TYPE_Q4_0:
5859
5944
  case GGML_TYPE_Q4_1:
5860
- return max_compute_capability >= CC_TURING ? 128 : 64;
5945
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
5861
5946
  case GGML_TYPE_Q5_0:
5862
5947
  case GGML_TYPE_Q5_1:
5863
5948
  case GGML_TYPE_Q8_0:
@@ -5868,7 +5953,7 @@ static int64_t get_row_rounding(ggml_type type) {
5868
5953
  case GGML_TYPE_Q3_K:
5869
5954
  case GGML_TYPE_Q4_K:
5870
5955
  case GGML_TYPE_Q5_K:
5871
- return max_compute_capability >= CC_TURING ? 128 : 64;
5956
+ return max_compute_capability >= CC_VOLTA ? 128 : 64;
5872
5957
  case GGML_TYPE_Q6_K:
5873
5958
  return 64;
5874
5959
  default:
@@ -6016,8 +6101,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
6016
6101
  GGML_ASSERT(src1_ddf_i != nullptr);
6017
6102
  GGML_ASSERT(dst_dd_i != nullptr);
6018
6103
 
6019
- const float alpha = 1.0f;
6020
- const float beta = 0.0f;
6021
6104
 
6022
6105
  const int64_t ne00 = src0->ne[0];
6023
6106
 
@@ -6026,16 +6109,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
6026
6109
  const int64_t ne0 = dst->ne[0];
6027
6110
  const int64_t row_diff = row_high - row_low;
6028
6111
 
6029
- float * src0_ddq_as_f32;
6030
- size_t src0_as = 0;
6031
-
6032
- if (src0->type != GGML_TYPE_F32) {
6033
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6034
- src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6035
- to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6036
- }
6037
- const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
6038
-
6039
6112
  int id;
6040
6113
  CUDA_CHECK(cudaGetDevice(&id));
6041
6114
 
@@ -6043,16 +6116,87 @@ inline void ggml_cuda_op_mul_mat_cublas(
6043
6116
  // ldc == nrows of the matrix that cuBLAS writes into
6044
6117
  int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
6045
6118
 
6046
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
6047
- CUBLAS_CHECK(
6048
- cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6049
- row_diff, src1_ncols, ne10,
6050
- &alpha, src0_ddf_i, ne00,
6051
- src1_ddf_i, ne10,
6052
- &beta, dst_dd_i, ldc));
6119
+ const int compute_capability = g_compute_capabilities[id];
6120
+
6121
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
6122
+ // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
6123
+ half * src0_as_f16 = nullptr;
6124
+ size_t src0_as = 0;
6125
+ if (src0->type != GGML_TYPE_F16) {
6126
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
6127
+ GGML_ASSERT(to_fp16_cuda != nullptr);
6128
+ size_t ne = row_diff*ne00;
6129
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
6130
+ to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
6131
+ }
6132
+ const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
6133
+
6134
+ half * src1_as_f16 = nullptr;
6135
+ size_t src1_as = 0;
6136
+ if (src1->type != GGML_TYPE_F16) {
6137
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
6138
+ GGML_ASSERT(to_fp16_cuda != nullptr);
6139
+ size_t ne = src1_ncols*ne10;
6140
+ src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
6141
+ to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
6142
+ }
6143
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
6144
+
6145
+ size_t dst_as = 0;
6146
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
6147
+
6148
+ const half alpha_f16 = 1.0f;
6149
+ const half beta_f16 = 0.0f;
6150
+
6151
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
6152
+ CUBLAS_CHECK(
6153
+ cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6154
+ row_diff, src1_ncols, ne10,
6155
+ &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
6156
+ src1_ptr, CUDA_R_16F, ne10,
6157
+ &beta_f16, dst_f16, CUDA_R_16F, ldc,
6158
+ CUBLAS_COMPUTE_16F,
6159
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
6160
+
6161
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
6162
+ to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
6163
+
6164
+ ggml_cuda_pool_free(dst_f16, dst_as);
6165
+
6166
+ if (src0_as != 0) {
6167
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
6168
+ }
6053
6169
 
6054
- if (src0_as > 0) {
6055
- ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6170
+ if (src1_as != 0) {
6171
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
6172
+ }
6173
+ }
6174
+ else {
6175
+ float * src0_ddq_as_f32 = nullptr;
6176
+ size_t src0_as = 0;
6177
+
6178
+ if (src0->type != GGML_TYPE_F32) {
6179
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6180
+ GGML_ASSERT(to_fp32_cuda != nullptr);
6181
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6182
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6183
+ }
6184
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
6185
+
6186
+ const float alpha = 1.0f;
6187
+ const float beta = 0.0f;
6188
+
6189
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
6190
+ CUBLAS_CHECK(
6191
+ cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6192
+ row_diff, src1_ncols, ne10,
6193
+ &alpha, src0_ddf_i, ne00,
6194
+ src1_ddf_i, ne10,
6195
+ &beta, dst_dd_i, ldc));
6196
+
6197
+ if (src0_as != 0) {
6198
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6199
+ }
6056
6200
  }
6057
6201
 
6058
6202
  (void) dst;
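
On Volta and newer, the rewritten cuBLAS path above converts src0 and src1 to fp16, runs the matrix multiplication in half precision, and converts the result back to fp32. Condensed to its core, the call looks like this (buffer names are illustrative; the handle, stream and sizes are assumed to be set up as in the surrounding code):

    // C = A^T * B entirely in fp16, with fp16 accumulation (CUBLAS_COMPUTE_16F)
    const half alpha = 1.0f;
    const half beta  = 0.0f;
    CUBLAS_CHECK(cublasSetStream(handle, stream));
    CUBLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
            row_diff, src1_ncols, ne10,
            &alpha, src0_f16, CUDA_R_16F, ne00,
                    src1_f16, CUDA_R_16F, ne10,
            &beta,  dst_f16,  CUDA_R_16F, ldc,
            CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    // dst_f16 is then widened back to fp32 via ggml_get_to_fp32_cuda(GGML_TYPE_F16)

The two defines added near the top of the diff map CUBLAS_COMPUTE_16F and CUBLAS_GEMM_DEFAULT_TENSOR_OP to their hipBLAS equivalents, so the same call also compiles on HIP builds.
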
@@ -6064,14 +6208,16 @@ inline void ggml_cuda_op_rope(
6064
6208
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6065
6209
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6066
6210
 
6067
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
6068
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
6211
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
6212
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
6213
+ GGML_ASSERT(src0->type == dst->type);
6069
6214
 
6070
6215
  const int64_t ne00 = src0->ne[0];
6071
6216
  const int64_t ne01 = src0->ne[1];
6217
+ const int64_t ne2 = dst->ne[2];
6072
6218
  const int64_t nrows = ggml_nrows(src0);
6073
6219
 
6074
- const int n_past = ((int32_t *) dst->op_params)[0];
6220
+ //const int n_past = ((int32_t *) dst->op_params)[0];
6075
6221
  const int n_dims = ((int32_t *) dst->op_params)[1];
6076
6222
  const int mode = ((int32_t *) dst->op_params)[2];
6077
6223
  const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -6082,19 +6228,38 @@ inline void ggml_cuda_op_rope(
6082
6228
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
6083
6229
 
6084
6230
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
6085
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
6231
+
6232
+ const int32_t * pos = nullptr;
6233
+ if ((mode & 1) == 0) {
6234
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
6235
+ GGML_ASSERT(src1->ne[0] == ne2);
6236
+ pos = (const int32_t *) src1_dd;
6237
+ }
6086
6238
 
6087
6239
  const bool is_neox = mode & 2;
6088
6240
  const bool is_glm = mode & 4;
6089
6241
 
6090
6242
  // compute
6091
6243
  if (is_glm) {
6092
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
6244
+ GGML_ASSERT(false);
6245
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
6093
6246
  } else if (is_neox) {
6094
6247
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
6095
- rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
6248
+ if (src0->type == GGML_TYPE_F32) {
6249
+ rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6250
+ } else if (src0->type == GGML_TYPE_F16) {
6251
+ rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6252
+ } else {
6253
+ GGML_ASSERT(false);
6254
+ }
6096
6255
  } else {
6097
- rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
6256
+ if (src0->type == GGML_TYPE_F32) {
6257
+ rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6258
+ } else if (src0->type == GGML_TYPE_F16) {
6259
+ rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6260
+ } else {
6261
+ GGML_ASSERT(false);
6262
+ }
6098
6263
  }
6099
6264
 
6100
6265
  (void) src1;
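
The RoPE kernels above now take per-row positions from src1 (a GGML_TYPE_I32 tensor) instead of a single n_past offset, and are templated over float/half so fp16 tensors can be rotated directly. A small host-side reference of the rotation applied to one column pair, mirroring the kernel's math (a sketch, not code from the diff):

    #include <cmath>
    #include <cstdint>

    // Reference for the rotation the templated `rope` kernel applies to the column
    // pair (col, col+1) of one row; parameter names mirror the kernel's.
    static void rope_pair_ref(const float * x, float * dst, int i, int i2,
                              const int32_t * pos, float freq_scale, float theta_scale, int col) {
        const int   p     = pos ? pos[i2] : 0;   // per-row position read from the I32 src1 tensor
        const float theta = p * freq_scale * std::pow(theta_scale, col/2);
        const float s = std::sin(theta), c = std::cos(theta);
        dst[i + 0] = x[i + 0]*c - x[i + 1]*s;
        dst[i + 1] = x[i + 0]*s + x[i + 1]*c;
    }
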
@@ -6265,7 +6430,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
6265
6430
  }
6266
6431
  }
6267
6432
 
6268
- void ggml_cuda_set_peer_access(const int n_tokens) {
6433
+ static void ggml_cuda_set_peer_access(const int n_tokens) {
6269
6434
  static bool peer_access_enabled = false;
6270
6435
 
6271
6436
  const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
@@ -6593,27 +6758,27 @@ static void ggml_cuda_op_mul_mat(
6593
6758
  }
6594
6759
  }
6595
6760
 
6596
- void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6761
+ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6597
6762
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6598
6763
  }
6599
6764
 
6600
- void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6765
+ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6601
6766
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
6602
6767
  }
6603
6768
 
6604
- void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6769
+ static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6605
6770
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
6606
6771
  }
6607
6772
 
6608
- void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6773
+ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6609
6774
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
6610
6775
  }
6611
6776
 
6612
- void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6777
+ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6613
6778
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
6614
6779
  }
6615
6780
 
6616
- void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6781
+ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6617
6782
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
6618
6783
  }
6619
6784
 
@@ -6624,17 +6789,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
6624
6789
  const int64_t ne1 = dst->ne[1];
6625
6790
 
6626
6791
  // TODO: find the optimal values for these
6627
- if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
6628
- src1->type == GGML_TYPE_F32 &&
6629
- dst->type == GGML_TYPE_F32 &&
6630
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
6631
- return true;
6632
- }
6633
-
6634
- return false;
6792
+ return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
6793
+ src1->type == GGML_TYPE_F32 &&
6794
+ dst->type == GGML_TYPE_F32 &&
6795
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
6635
6796
  }
6636
6797
 
6637
- void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
6798
+ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
6638
6799
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
6639
6800
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
6640
6801
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -6663,7 +6824,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6663
6824
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
6664
6825
  }
6665
6826
 
6666
- void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
6827
+ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
6667
6828
  GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
6668
6829
  GGML_ASSERT(!ggml_is_permuted(src0));
6669
6830
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
@@ -6697,7 +6858,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
6697
6858
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
6698
6859
  }
6699
6860
 
6700
- void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6861
+ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6701
6862
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
6702
6863
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
6703
6864
 
@@ -6741,11 +6902,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
6741
6902
  }
6742
6903
  }
6743
6904
 
6744
- void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6905
+ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6745
6906
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6746
6907
  }
6747
6908
 
6748
- void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6909
+ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6749
6910
  const int64_t ne = ggml_nelements(src0);
6750
6911
  GGML_ASSERT(ne == ggml_nelements(src1));
6751
6912
 
@@ -6787,35 +6948,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6787
6948
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
6788
6949
  ne10, ne11, nb10, nb11, nb12, main_stream);
6789
6950
  } else {
6951
+ fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
6952
+ ggml_type_name(src0->type), ggml_type_name(src1->type));
6790
6953
  GGML_ASSERT(false);
6791
6954
  }
6792
6955
 
6793
6956
  (void) dst;
6794
6957
  }
6795
6958
 
6796
- void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6959
+ static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6797
6960
  ggml_cuda_cpy(src0, dst, nullptr);
6798
6961
  (void) src1;
6799
6962
  }
6800
6963
 
6801
- void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6964
+ static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6802
6965
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
6803
6966
  }
6804
6967
 
6805
- void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6968
+ static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6806
6969
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
6807
6970
  }
6808
6971
 
6809
- void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6972
+ static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6810
6973
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
6811
6974
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
6812
6975
  }
6813
6976
 
6814
- void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6977
+ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6815
6978
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
6816
6979
  }
6817
6980
 
6818
- void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6981
+ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6819
6982
  (void) src0;
6820
6983
  (void) src1;
6821
6984
  (void) dst;
@@ -6938,11 +7101,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
6938
7101
  return extra;
6939
7102
  }
6940
7103
 
6941
- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
7104
+ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
6942
7105
  if (scratch && g_scratch_size == 0) {
6943
7106
  return;
6944
7107
  }
6945
7108
 
7109
+ tensor->backend = GGML_BACKEND_GPU;
7110
+
6946
7111
  // recursively assign CUDA buffers until a compute tensor is found
6947
7112
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
6948
7113
  const ggml_op src0_op = tensor->src[0]->op;
@@ -6954,8 +7119,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
6954
7119
  ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
6955
7120
  }
6956
7121
 
6957
- tensor->backend = GGML_BACKEND_GPU;
6958
-
6959
7122
  if (scratch && no_alloc) {
6960
7123
  return;
6961
7124
  }
@@ -7040,6 +7203,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
7040
7203
  tensor->extra = extra;
7041
7204
  }
7042
7205
 
7206
+ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
7207
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7208
+ GGML_ASSERT(ggml_is_contiguous(tensor));
7209
+
7210
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7211
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7212
+ CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7213
+ }
7214
+
7043
7215
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
7044
7216
  ggml_cuda_assign_buffers_impl(tensor, true, false, false);
7045
7217
  }
@@ -7075,7 +7247,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
7075
7247
  }
7076
7248
 
7077
7249
  void ggml_cuda_set_scratch_size(const size_t scratch_size) {
7078
- g_scratch_size = scratch_size;
7250
+ // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
7251
+ // it still won't always work as expected, but it's better than nothing
7252
+ if (scratch_size > g_scratch_size) {
7253
+ ggml_cuda_free_scratch();
7254
+ }
7255
+ g_scratch_size = std::max(g_scratch_size, scratch_size);
7079
7256
  }
7080
7257
 
7081
7258
  void ggml_cuda_free_scratch() {
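
A note on the final hunk: together with the default g_scratch_size dropping from 1 GB to 0, ggml_cuda_set_scratch_size now only ever grows the configured size; requesting more than the current size frees the old scratch buffer so it can be reallocated, while requesting less keeps the larger value. A hedged usage sketch:

    ggml_cuda_set_scratch_size(512u*1024*1024); // grows from the new 0-byte default; any smaller existing buffer is freed first
    ggml_cuda_set_scratch_size(256u*1024*1024); // no change in size: std::max keeps the previous 512 MiB
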