llama_cpp 0.10.0 → 0.10.2

This diff compares the publicly released contents of two package versions as they appear in their public registry. It is provided for informational purposes only and reflects only the changes between those released versions.
@@ -1,13 +1,15 @@
1
1
  #include <algorithm>
2
+ #include <assert.h>
3
+ #include <atomic>
4
+ #include <cinttypes>
2
5
  #include <cstddef>
3
6
  #include <cstdint>
4
- #include <cinttypes>
5
7
  #include <float.h>
6
8
  #include <limits>
7
9
  #include <stdint.h>
8
10
  #include <stdio.h>
9
- #include <atomic>
10
- #include <assert.h>
11
+ #include <vector>
12
+
11
13
 
12
14
  #if defined(GGML_USE_HIPBLAS)
13
15
  #include <hip/hip_runtime.h>
@@ -29,6 +31,7 @@
29
31
  #define CUDA_R_16F HIPBLAS_R_16F
30
32
  #define CUDA_R_32F HIPBLAS_R_32F
31
33
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
34
+ #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
32
35
  #define cublasCreate hipblasCreate
33
36
  #define cublasGemmEx hipblasGemmEx
34
37
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -38,6 +41,7 @@
38
41
  #define cublasSetStream hipblasSetStream
39
42
  #define cublasSgemm hipblasSgemm
40
43
  #define cublasStatus_t hipblasStatus_t
44
+ #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
41
45
  #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
42
46
  #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
43
47
  #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -56,8 +60,13 @@
56
60
  #define cudaGetDeviceProperties hipGetDeviceProperties
57
61
  #define cudaGetErrorString hipGetErrorString
58
62
  #define cudaGetLastError hipGetLastError
63
+ #ifdef GGML_HIP_UMA
64
+ #define cudaMalloc hipMallocManaged
65
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
66
+ #else
59
67
  #define cudaMalloc hipMalloc
60
68
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
69
+ #endif
61
70
  #define cudaMemcpy hipMemcpy
62
71
  #define cudaMemcpy2DAsync hipMemcpy2DAsync
63
72
  #define cudaMemcpyAsync hipMemcpyAsync
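Under the new GGML_HIP_UMA build flag, cudaMalloc is mapped to hipMallocManaged (unified memory, useful on AMD APUs where CPU and GPU share RAM) and cudaMallocHost drops the hipHostMallocDefault flag; without the flag the previous hipMalloc mapping is kept. A minimal sketch of what this amounts to after preprocessing; ggml_hip_alloc is a hypothetical helper name, not a symbol in the package:

    #include <hip/hip_runtime.h>

    // Illustrative only: how the GGML_HIP_UMA switch resolves allocations.
    static hipError_t ggml_hip_alloc(void ** ptr, size_t size) {
    #ifdef GGML_HIP_UMA
        return hipMallocManaged(ptr, size); // host- and device-visible (managed) memory
    #else
        return hipMalloc(ptr, size);        // regular device allocation
    #endif
    }
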
@@ -76,6 +85,7 @@
76
85
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
77
86
  #define cudaStream_t hipStream_t
78
87
  #define cudaSuccess hipSuccess
88
+ #define __trap abort
79
89
  #else
80
90
  #include <cuda_runtime.h>
81
91
  #include <cublas_v2.h>
@@ -437,6 +447,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
437
447
 
438
448
  #define CUDA_GELU_BLOCK_SIZE 256
439
449
  #define CUDA_SILU_BLOCK_SIZE 256
450
+ #define CUDA_TANH_BLOCK_SIZE 256
440
451
  #define CUDA_RELU_BLOCK_SIZE 256
441
452
  #define CUDA_SQR_BLOCK_SIZE 256
442
453
  #define CUDA_CPY_BLOCK_SIZE 32
@@ -449,6 +460,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
449
460
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
450
461
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
451
462
  #define CUDA_GET_ROWS_BLOCK_SIZE 256
463
+ #define CUDA_UPSCALE_BLOCK_SIZE 256
464
+ #define CUDA_CONCAT_BLOCK_SIZE 256
465
+ #define CUDA_PAD_BLOCK_SIZE 256
466
+ #define CUDA_ACC_BLOCK_SIZE 256
467
+ #define CUDA_IM2COL_BLOCK_SIZE 256
452
468
 
453
469
  // dmmv = dequantize_mul_mat_vec
454
470
  #ifndef GGML_CUDA_DMMV_X
@@ -502,6 +518,14 @@ static size_t g_scratch_offset = 0;
502
518
 
503
519
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
504
520
 
521
+ [[noreturn]]
522
+ static __device__ void bad_arch() {
523
+ printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
524
+ __trap();
525
+
526
+ (void) bad_arch; // suppress unused function warning
527
+ }
528
+
505
529
  static __device__ __forceinline__ float warp_reduce_sum(float x) {
506
530
  #pragma unroll
507
531
  for (int mask = 16; mask > 0; mask >>= 1) {
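The new bad_arch() device helper above is used in the hunks that follow to replace the old `assert(false); return 0.0f;` fallback taken when a kernel is compiled for an unsupported GPU architecture: it prints an error and traps, and because it is [[noreturn]] the callers no longer need a dummy return value. A minimal sketch of the pattern, assuming MIN_CC_DP4A corresponds to compute capability 6.1; dp4a_dot is a placeholder, not an actual ggml-cuda.cu symbol:

    #include <cstdio>

    [[noreturn]] static __device__ void bad_arch() {
        printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
        __trap();
    }

    static __device__ __forceinline__ float dp4a_dot(int a, int b) {
    #if __CUDA_ARCH__ >= 610 // MIN_CC_DP4A in the real code
        return (float) __dp4a(a, b, 0);
    #else
        bad_arch();          // never returns, so no dummy return value is required
    #endif
    }
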
@@ -610,6 +634,24 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
610
634
  dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
611
635
  }
612
636
 
637
+ static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
638
+ const int ne10, const int ne11, const int ne12,
639
+ const int nb1, const int nb2, int offset) {
640
+ const int i = blockDim.x * blockIdx.x + threadIdx.x;
641
+ if (i >= ne) {
642
+ return;
643
+ }
644
+ int src1_idx = i - offset;
645
+ int oz = src1_idx / nb2;
646
+ int oy = (src1_idx - (oz * nb2)) / nb1;
647
+ int ox = src1_idx % nb1;
648
+ if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
649
+ dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
650
+ } else {
651
+ dst[i] = x[i];
652
+ }
653
+ }
654
+
613
655
  static __global__ void gelu_f32(const float * x, float * dst, const int k) {
614
656
  const float GELU_COEF_A = 0.044715f;
615
657
  const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
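The acc_f32 kernel added above implements GGML_OP_ACC on the GPU: dst is a copy of x with y accumulated into a sub-view that starts at a given element offset, where nb1 and nb2 are the row and plane strides of that view in elements. A plain CPU reference of the same index math, for illustration only (acc_f32_ref is not part of the package):

    #include <cstddef>
    #include <vector>

    // CPU reference for acc_f32; strides and offset are in elements
    // (the CUDA wrapper divides the byte strides by sizeof(float)).
    static void acc_f32_ref(const std::vector<float> & x, const std::vector<float> & y,
                            std::vector<float> & dst,
                            int ne10, int ne11, int ne12, int nb1, int nb2, int offset) {
        for (size_t i = 0; i < x.size(); ++i) {
            const int src1_idx = (int) i - offset;
            const int oz = src1_idx / nb2;
            const int oy = (src1_idx - oz * nb2) / nb1;
            const int ox = src1_idx % nb1;
            if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
                dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
            } else {
                dst[i] = x[i];
            }
        }
    }
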
@@ -632,6 +674,23 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
632
674
  dst[i] = x[i] / (1.0f + expf(-x[i]));
633
675
  }
634
676
 
677
+ static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
678
+ const float GELU_QUICK_COEF = -1.702f;
679
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
680
+ if (i >= k) {
681
+ return;
682
+ }
683
+ dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
684
+ }
685
+
686
+ static __global__ void tanh_f32(const float *x, float *dst, int k) {
687
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
688
+ if (i >= k) {
689
+ return;
690
+ }
691
+ dst[i] = tanhf(x[i]);
692
+ }
693
+
635
694
  static __global__ void relu_f32(const float * x, float * dst, const int k) {
636
695
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
637
696
 
@@ -641,6 +700,14 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
641
700
  dst[i] = fmaxf(x[i], 0);
642
701
  }
643
702
 
703
+ static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
704
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
705
+ if (i >= k) {
706
+ return;
707
+ }
708
+ dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
709
+ }
710
+
644
711
  static __global__ void sqr_f32(const float * x, float * dst, const int k) {
645
712
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
646
713
 
@@ -686,6 +753,132 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
686
753
  }
687
754
  }
688
755
 
756
+ static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
757
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
758
+ if (nidx >= ne0) {
759
+ return;
760
+ }
761
+ // operation
762
+ int offset_dst =
763
+ nidx +
764
+ blockIdx.y * ne0 +
765
+ blockIdx.z * ne0 * gridDim.y;
766
+ if (blockIdx.z < ne02) { // src0
767
+ int offset_src =
768
+ nidx +
769
+ blockIdx.y * ne0 +
770
+ blockIdx.z * ne0 * gridDim.y;
771
+ dst[offset_dst] = x[offset_src];
772
+ } else {
773
+ int offset_src =
774
+ nidx +
775
+ blockIdx.y * ne0 +
776
+ (blockIdx.z - ne02) * ne0 * gridDim.y;
777
+ dst[offset_dst] = y[offset_src];
778
+ }
779
+ }
780
+
781
+ static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
782
+ int ne0 = ne00 * scale_factor;
783
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
784
+ if (nidx >= ne0) {
785
+ return;
786
+ }
787
+ // operation
788
+ int i00 = nidx / scale_factor;
789
+ int i01 = blockIdx.y / scale_factor;
790
+ int offset_src =
791
+ i00 +
792
+ i01 * ne00 +
793
+ blockIdx.z * nb02;
794
+ int offset_dst =
795
+ nidx +
796
+ blockIdx.y * ne0 +
797
+ blockIdx.z * ne0 * gridDim.y;
798
+ dst[offset_dst] = x[offset_src];
799
+ }
800
+
801
+ static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
802
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
803
+ if (nidx >= ne0) {
804
+ return;
805
+ }
806
+
807
+ // operation
808
+ int offset_dst =
809
+ nidx +
810
+ blockIdx.y * ne0 +
811
+ blockIdx.z * ne0 * gridDim.y;
812
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
813
+ int offset_src =
814
+ nidx +
815
+ blockIdx.y * ne00 +
816
+ blockIdx.z * ne00 * ne01;
817
+ dst[offset_dst] = x[offset_src];
818
+ } else {
819
+ dst[offset_dst] = 0.0f;
820
+ }
821
+ }
822
+
823
+ template <int block_size>
824
+ static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
825
+ int start = blockIdx.x * group_size;
826
+ int end = start + group_size;
827
+
828
+ start += threadIdx.x;
829
+
830
+ if (end >= ne_elements) {
831
+ end = ne_elements;
832
+ }
833
+
834
+ float tmp = 0.0f; // partial sum for thread in warp
835
+
836
+ for (int j = start; j < end; j += block_size) {
837
+ tmp += x[j];
838
+ }
839
+
840
+ tmp = warp_reduce_sum(tmp);
841
+ if (block_size > WARP_SIZE) {
842
+ __shared__ float s_sum[32];
843
+ int warp_id = threadIdx.x / WARP_SIZE;
844
+ int lane_id = threadIdx.x % WARP_SIZE;
845
+ if (lane_id == 0) {
846
+ s_sum[warp_id] = tmp;
847
+ }
848
+ __syncthreads();
849
+ tmp = s_sum[lane_id];
850
+ tmp = warp_reduce_sum(tmp);
851
+ }
852
+
853
+ float mean = tmp / group_size;
854
+ tmp = 0.0f;
855
+
856
+ for (int j = start; j < end; j += block_size) {
857
+ float xi = x[j] - mean;
858
+ dst[j] = xi;
859
+ tmp += xi * xi;
860
+ }
861
+
862
+ tmp = warp_reduce_sum(tmp);
863
+ if (block_size > WARP_SIZE) {
864
+ __shared__ float s_sum[32];
865
+ int warp_id = threadIdx.x / WARP_SIZE;
866
+ int lane_id = threadIdx.x % WARP_SIZE;
867
+ if (lane_id == 0) {
868
+ s_sum[warp_id] = tmp;
869
+ }
870
+ __syncthreads();
871
+ tmp = s_sum[lane_id];
872
+ tmp = warp_reduce_sum(tmp);
873
+ }
874
+
875
+ float variance = tmp / group_size;
876
+ float scale = rsqrtf(variance + eps);
877
+ for (int j = start; j < end; j += block_size) {
878
+ dst[j] *= scale;
879
+ }
880
+ }
881
+
689
882
  template <int block_size>
690
883
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
691
884
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
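group_norm_f32 above normalizes each group of group_size consecutive elements in two passes: compute the mean, write out the centered values while accumulating their squares, then scale by rsqrtf(variance + eps); the warp and shared-memory reductions only parallelize those sums. A scalar reference of the same computation for one group (illustrative sketch; group_norm_ref is not a package symbol):

    #include <cmath>

    // CPU reference for one group of group_norm_f32.
    static void group_norm_ref(const float * x, float * dst, int group_size, float eps) {
        float mean = 0.0f;
        for (int j = 0; j < group_size; ++j) {
            mean += x[j];
        }
        mean /= group_size;

        float variance = 0.0f;
        for (int j = 0; j < group_size; ++j) {
            const float xi = x[j] - mean;
            dst[j] = xi;            // the kernel also stores the centered value first
            variance += xi * xi;
        }
        variance /= group_size;

        const float scale = 1.0f / std::sqrt(variance + eps);
        for (int j = 0; j < group_size; ++j) {
            dst[j] *= scale;
        }
    }
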
@@ -1684,31 +1877,65 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1684
1877
  }
1685
1878
 
1686
1879
  template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1687
- static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
1688
- const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1689
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
1690
-
1691
- if (col >= ncols) {
1880
+ static __global__ void k_get_rows(
1881
+ const void * src0, const int32_t * src1, dst_t * dst,
1882
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
1883
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
1884
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
1885
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
1886
+ size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
1887
+
1888
+ const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1889
+ const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
1890
+ const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
1891
+ const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
1892
+
1893
+ if (i00 >= ne00) {
1692
1894
  return;
1693
1895
  }
1694
1896
 
1695
- const int r = y[row];
1897
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
1696
1898
 
1697
- // copy x[r*ncols + col] to dst[row*ncols + col]
1698
- const int xi = r*ncols + col;
1699
- const int di = row*ncols + col;
1899
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
1900
+ const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
1700
1901
 
1701
- const int ib = xi/qk; // block index
1702
- const int iqs = (xi%qk)/qr; // quant index
1703
- const int iybs = di - di%qk; // y block start index
1902
+ const int ib = i00/qk; // block index
1903
+ const int iqs = (i00%qk)/qr; // quant index
1904
+ const int iybs = i00 - i00%qk; // dst block start index
1704
1905
  const int y_offset = qr == 1 ? 1 : qk/2;
1705
1906
 
1706
1907
  // dequantize
1707
1908
  dfloat2 v;
1708
- dequantize_kernel(x, ib, iqs, v);
1909
+ dequantize_kernel(src0_row, ib, iqs, v);
1709
1910
 
1710
- dst[iybs + iqs + 0] = v.x;
1711
- dst[iybs + iqs + y_offset] = v.y;
1911
+ dst_row[iybs + iqs + 0] = v.x;
1912
+ dst_row[iybs + iqs + y_offset] = v.y;
1913
+ }
1914
+
1915
+ template<typename src0_t, typename dst_t>
1916
+ static __global__ void k_get_rows_float(
1917
+ const src0_t * src0, const int32_t * src1, dst_t * dst,
1918
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
1919
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
1920
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
1921
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
1922
+ size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
1923
+
1924
+ const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
1925
+ const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
1926
+ const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
1927
+ const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
1928
+
1929
+ if (i00 >= ne00) {
1930
+ return;
1931
+ }
1932
+
1933
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
1934
+
1935
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
1936
+ const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
1937
+
1938
+ dst_row[i00] = src0_row[i00];
1712
1939
  }
1713
1940
 
1714
1941
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
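The get_rows kernels were generalized from a flat (row, column) copy into a strided, batched gather: src1 is now treated as a 3-D tensor of row indices, and the dst/src0/src1 strides are passed in explicitly, which is what allows the contiguity requirements to be relaxed further down. A rough host-side picture of the new float path; names and types are simplified for illustration and get_rows_float_ref is not a package symbol:

    #include <cstddef>
    #include <cstdint>

    // Simplified CPU analogue of k_get_rows_float: for each (i10, i11, i12),
    // read a row id from src1 and copy ne00 elements from that row of src0.
    static void get_rows_float_ref(const float * src0, const int32_t * src1, float * dst,
                                   int64_t ne00, int64_t ne10, int64_t ne11, int64_t ne12,
                                   size_t s1, size_t s2, size_t s3,       // dst strides (elements)
                                   size_t nb01, size_t nb02, size_t nb03, // src0 strides (bytes)
                                   size_t s10, size_t s11, size_t s12) {  // src1 strides (elements)
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            for (int64_t i11 = 0; i11 < ne11; ++i11) {
                for (int64_t i10 = 0; i10 < ne10; ++i10) {
                    const int64_t i01 = src1[i10*s10 + i11*s11 + i12*s12];
                    const float * src0_row =
                        (const float *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
                    float * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
                    for (int64_t i00 = 0; i00 < ne00; ++i00) {
                        dst_row[i00] = src0_row[i00];
                    }
                }
            }
        }
    }
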
@@ -1759,8 +1986,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
1759
1986
  // second part effectively subtracts 8 from each quant value
1760
1987
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
1761
1988
  #else
1762
- assert(false);
1763
- return 0.0f; // only to satisfy the compiler
1989
+ bad_arch();
1764
1990
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1765
1991
  }
1766
1992
 
@@ -1797,8 +2023,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
1797
2023
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
1798
2024
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
1799
2025
  #else
1800
- assert(false);
1801
- return 0.0f; // only to satisfy the compiler
2026
+ bad_arch();
1802
2027
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1803
2028
  }
1804
2029
 
@@ -1833,8 +2058,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
1833
2058
  // second part effectively subtracts 16 from each quant value
1834
2059
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
1835
2060
  #else
1836
- assert(false);
1837
- return 0.0f; // only to satisfy the compiler
2061
+ bad_arch();
1838
2062
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1839
2063
  }
1840
2064
 
@@ -1879,8 +2103,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
1879
2103
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
1880
2104
 
1881
2105
  #else
1882
- assert(false);
1883
- return 0.0f; // only to satisfy the compiler
2106
+ bad_arch();
1884
2107
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1885
2108
  }
1886
2109
 
@@ -1901,8 +2124,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
1901
2124
 
1902
2125
  return d8_0*d8_1 * sumi;
1903
2126
  #else
1904
- assert(false);
1905
- return 0.0f; // only to satisfy the compiler
2127
+ bad_arch();
1906
2128
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1907
2129
  }
1908
2130
 
@@ -1932,8 +2154,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
1932
2154
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
1933
2155
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
1934
2156
  #else
1935
- assert(false);
1936
- return 0.0f; // only to satisfy the compiler
2157
+ bad_arch();
1937
2158
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1938
2159
  }
1939
2160
 
@@ -1968,8 +2189,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
1968
2189
 
1969
2190
  return dm2f.x*sumf_d - dm2f.y*sumf_m;
1970
2191
  #else
1971
- assert(false);
1972
- return 0.0f; // only to satisfy the compiler
2192
+ bad_arch();
1973
2193
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1974
2194
  }
1975
2195
 
@@ -2006,8 +2226,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
2006
2226
 
2007
2227
  return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
2008
2228
  #else
2009
- assert(false);
2010
- return 0.0f; // only to satisfy the compiler
2229
+ bad_arch();
2011
2230
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2012
2231
  }
2013
2232
 
@@ -2047,8 +2266,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
2047
2266
 
2048
2267
  return d3 * sumf;
2049
2268
  #else
2050
- assert(false);
2051
- return 0.0f; // only to satisfy the compiler
2269
+ bad_arch();
2052
2270
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2053
2271
  }
2054
2272
 
@@ -2073,8 +2291,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
2073
2291
 
2074
2292
  return d3*d8 * sumi;
2075
2293
  #else
2076
- assert(false);
2077
- return 0.0f; // only to satisfy the compiler
2294
+ bad_arch();
2078
2295
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2079
2296
  }
2080
2297
 
@@ -2107,8 +2324,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
2107
2324
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2108
2325
 
2109
2326
  #else
2110
- assert(false);
2111
- return 0.0f; // only to satisfy the compiler
2327
+ bad_arch();
2112
2328
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2113
2329
  }
2114
2330
 
@@ -2141,8 +2357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
2141
2357
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2142
2358
 
2143
2359
  #else
2144
- assert(false);
2145
- return 0.0f; // only to satisfy the compiler
2360
+ bad_arch();
2146
2361
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2147
2362
  }
2148
2363
 
@@ -2182,8 +2397,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
2182
2397
  return dm5f.x*sumf_d - dm5f.y*sumf_m;
2183
2398
 
2184
2399
  #else
2185
- assert(false);
2186
- return 0.0f; // only to satisfy the compiler
2400
+ bad_arch();
2187
2401
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2188
2402
  }
2189
2403
 
@@ -2216,8 +2430,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
2216
2430
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2217
2431
 
2218
2432
  #else
2219
- assert(false);
2220
- return 0.0f; // only to satisfy the compiler
2433
+ bad_arch();
2221
2434
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2222
2435
  }
2223
2436
 
@@ -2247,8 +2460,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
2247
2460
 
2248
2461
  return d*sumf;
2249
2462
  #else
2250
- assert(false);
2251
- return 0.0f; // only to satisfy the compiler
2463
+ bad_arch();
2252
2464
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2253
2465
  }
2254
2466
 
@@ -2279,8 +2491,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
2279
2491
  return d6 * sumf_d;
2280
2492
 
2281
2493
  #else
2282
- assert(false);
2283
- return 0.0f; // only to satisfy the compiler
2494
+ bad_arch();
2284
2495
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2285
2496
  }
2286
2497
 
@@ -3146,8 +3357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
3146
3357
  return dall * sumf_d - dmin * sumf_m;
3147
3358
 
3148
3359
  #else
3149
- assert(false);
3150
- return 0.0f; // only to satisfy the compiler
3360
+ bad_arch();
3151
3361
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3152
3362
 
3153
3363
  #endif
@@ -3330,8 +3540,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
3330
3540
  return d * sumf_d;
3331
3541
 
3332
3542
  #else
3333
- assert(false);
3334
- return 0.0f; // only to satisfy the compiler
3543
+ bad_arch();
3335
3544
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3336
3545
 
3337
3546
  #endif
@@ -3741,7 +3950,7 @@ template <bool need_check> static __global__ void
3741
3950
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3742
3951
  #else
3743
3952
  (void) vec_dot_q4_0_q8_1_mul_mat;
3744
- assert(false);
3953
+ bad_arch();
3745
3954
  #endif // __CUDA_ARCH__ >= CC_VOLTA
3746
3955
  }
3747
3956
 
@@ -3810,7 +4019,7 @@ template <bool need_check> static __global__ void
3810
4019
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3811
4020
  #else
3812
4021
  (void) vec_dot_q4_1_q8_1_mul_mat;
3813
- assert(false);
4022
+ bad_arch();
3814
4023
  #endif // __CUDA_ARCH__ >= CC_VOLTA
3815
4024
  }
3816
4025
 
@@ -3877,7 +4086,7 @@ template <bool need_check> static __global__ void
3877
4086
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3878
4087
  #else
3879
4088
  (void) vec_dot_q5_0_q8_1_mul_mat;
3880
- assert(false);
4089
+ bad_arch();
3881
4090
  #endif // __CUDA_ARCH__ >= CC_VOLTA
3882
4091
  }
3883
4092
 
@@ -3944,7 +4153,7 @@ mul_mat_q5_1(
3944
4153
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3945
4154
  #else
3946
4155
  (void) vec_dot_q5_1_q8_1_mul_mat;
3947
- assert(false);
4156
+ bad_arch();
3948
4157
  #endif // __CUDA_ARCH__ >= CC_VOLTA
3949
4158
  }
3950
4159
 
@@ -4011,7 +4220,7 @@ template <bool need_check> static __global__ void
4011
4220
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4012
4221
  #else
4013
4222
  (void) vec_dot_q8_0_q8_1_mul_mat;
4014
- assert(false);
4223
+ bad_arch();
4015
4224
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4016
4225
  }
4017
4226
 
@@ -4078,7 +4287,7 @@ mul_mat_q2_K(
4078
4287
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4079
4288
  #else
4080
4289
  (void) vec_dot_q2_K_q8_1_mul_mat;
4081
- assert(false);
4290
+ bad_arch();
4082
4291
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4083
4292
  }
4084
4293
 
@@ -4147,7 +4356,7 @@ template <bool need_check> static __global__ void
4147
4356
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4148
4357
  #else
4149
4358
  (void) vec_dot_q3_K_q8_1_mul_mat;
4150
- assert(false);
4359
+ bad_arch();
4151
4360
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4152
4361
  }
4153
4362
 
@@ -4216,7 +4425,7 @@ template <bool need_check> static __global__ void
4216
4425
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4217
4426
  #else
4218
4427
  (void) vec_dot_q4_K_q8_1_mul_mat;
4219
- assert(false);
4428
+ bad_arch();
4220
4429
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4221
4430
  }
4222
4431
 
@@ -4283,7 +4492,7 @@ mul_mat_q5_K(
4283
4492
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4284
4493
  #else
4285
4494
  (void) vec_dot_q5_K_q8_1_mul_mat;
4286
- assert(false);
4495
+ bad_arch();
4287
4496
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4288
4497
  }
4289
4498
 
@@ -4352,7 +4561,7 @@ template <bool need_check> static __global__ void
4352
4561
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4353
4562
  #else
4354
4563
  (void) vec_dot_q6_K_q8_1_mul_mat;
4355
- assert(false);
4564
+ bad_arch();
4356
4565
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4357
4566
  }
4358
4567
 
@@ -4787,7 +4996,16 @@ static __global__ void rope_neox(
4787
4996
  const int ib = col / n_dims;
4788
4997
  const int ic = col % n_dims;
4789
4998
 
4790
- const int i = row*ncols + ib*n_dims + ic/2;
4999
+ if (ib > 0) {
5000
+ const int i = row*ncols + ib*n_dims + ic;
5001
+
5002
+ dst[i + 0] = x[i + 0];
5003
+ dst[i + 1] = x[i + 1];
5004
+
5005
+ return;
5006
+ }
5007
+
5008
+ const int i = row*ncols + ib*n_dims + ic/2;
4791
5009
  const int i2 = row/p_delta_rows;
4792
5010
 
4793
5011
  float cur_rot = inv_ndims * ic - ib;
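The rope_neox change above makes columns beyond the first n_dims pass through unrotated (the ib > 0 branch) instead of being fed into the rotation index math. A scalar sketch of the per-column behavior; rope_neox_column_ref is purely illustrative:

    // Per-column behavior of the patched rope_neox: only the first n_dims
    // columns of each row are rotated, the rest are copied unchanged.
    static void rope_neox_column_ref(const float * x, float * dst,
                                     int row, int col, int ncols, int n_dims) {
        const int ib = col / n_dims;
        const int ic = col % n_dims;
        if (ib > 0) {
            const int i = row*ncols + ib*n_dims + ic;
            dst[i + 0] = x[i + 0];
            dst[i + 1] = x[i + 1];
            return;
        }
        // the ib == 0 rotation path is unchanged from the previous version
    }
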
@@ -5035,29 +5253,98 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
5035
5253
 
5036
5254
  static __global__ void im2col_f32_f16(
5037
5255
  const float * x, half * dst,
5038
- int ofs0, int ofs1, int IW, int IH, int CHW,
5256
+ int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
5039
5257
  int s0, int s1, int p0, int p1, int d0, int d1) {
5040
- const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
5041
- const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
5258
+ const int i = threadIdx.x + blockIdx.x * blockDim.x;
5259
+ if (i >= pelements) {
5260
+ return;
5261
+ }
5262
+
5263
+ const int ksize = OW * (KH > 1 ? KW : 1);
5264
+ const int kx = i / ksize;
5265
+ const int kd = kx * ksize;
5266
+ const int ky = (i - kd) / OW;
5267
+ const int ix = i % OW;
5268
+
5269
+ const int iiw = ix * s0 + kx * d0 - p0;
5270
+ const int iih = blockIdx.y * s1 + ky * d1 - p1;
5042
5271
 
5043
5272
  const int offset_dst =
5044
- (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
5045
- (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
5273
+ (blockIdx.y * OW + ix) * CHW +
5274
+ (blockIdx.z * (KW * KH) + ky * KW + kx);
5046
5275
 
5047
5276
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
5048
5277
  dst[offset_dst] = __float2half(0.0f);
5049
5278
  } else {
5050
- const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
5279
+ const int offset_src = blockIdx.z * offset_delta;
5051
5280
  dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
5052
5281
  }
5053
5282
  }
5054
5283
 
5055
5284
  template<int qk, int qr, dequantize_kernel_t dq>
5056
- static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
5285
+ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5286
+ const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
5287
+
5288
+ GGML_TENSOR_BINARY_OP_LOCALS
5289
+
5057
5290
  const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
5058
- const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
5059
- const dim3 block_nums(block_num_x, nrows, 1);
5060
- k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
5291
+ const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
5292
+ const dim3 block_nums(block_num_x, ne10, ne11*ne12);
5293
+
5294
+ // strides in elements
5295
+ //const size_t s0 = nb0 / ggml_element_size(dst);
5296
+ const size_t s1 = nb1 / ggml_element_size(dst);
5297
+ const size_t s2 = nb2 / ggml_element_size(dst);
5298
+ const size_t s3 = nb3 / ggml_element_size(dst);
5299
+
5300
+ const size_t s10 = nb10 / ggml_element_size(src1);
5301
+ const size_t s11 = nb11 / ggml_element_size(src1);
5302
+ const size_t s12 = nb12 / ggml_element_size(src1);
5303
+ //const size_t s13 = nb13 / ggml_element_size(src1);
5304
+
5305
+ GGML_ASSERT(ne00 % 2 == 0);
5306
+
5307
+ k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
5308
+ src0_dd, src1_dd, dst_dd,
5309
+ ne00, /*ne01, ne02, ne03,*/
5310
+ /*ne10, ne11,*/ ne12, /*ne13,*/
5311
+ /* s0,*/ s1, s2, s3,
5312
+ /* nb00,*/ nb01, nb02, nb03,
5313
+ s10, s11, s12/*, s13*/);
5314
+
5315
+ (void) dst;
5316
+ }
5317
+
5318
+ template<typename src0_t>
5319
+ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5320
+ const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
5321
+
5322
+ GGML_TENSOR_BINARY_OP_LOCALS
5323
+
5324
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
5325
+ const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
5326
+ const dim3 block_nums(block_num_x, ne10, ne11*ne12);
5327
+
5328
+ // strides in elements
5329
+ //const size_t s0 = nb0 / ggml_element_size(dst);
5330
+ const size_t s1 = nb1 / ggml_element_size(dst);
5331
+ const size_t s2 = nb2 / ggml_element_size(dst);
5332
+ const size_t s3 = nb3 / ggml_element_size(dst);
5333
+
5334
+ const size_t s10 = nb10 / ggml_element_size(src1);
5335
+ const size_t s11 = nb11 / ggml_element_size(src1);
5336
+ const size_t s12 = nb12 / ggml_element_size(src1);
5337
+ //const size_t s13 = nb13 / ggml_element_size(src1);
5338
+
5339
+ k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
5340
+ src0_dd, src1_dd, dst_dd,
5341
+ ne00, /*ne01, ne02, ne03,*/
5342
+ /*ne10, ne11,*/ ne12, /*ne13,*/
5343
+ /* s0,*/ s1, s2, s3,
5344
+ /* nb00,*/ nb01, nb02, nb03,
5345
+ s10, s11, s12/*, s13*/);
5346
+
5347
+ (void) dst;
5061
5348
  }
5062
5349
 
5063
5350
  template<float (*bin_op)(const float, const float)>
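The rewritten im2col_f32_f16 no longer maps one thread per (N, KH, KW) position with a 3-D block; it flattens OW*KW*KH into "parallel elements" and decomposes the flat index per thread, launching 1-D blocks of CUDA_IM2COL_BLOCK_SIZE over a (blocks, OH, IC) grid. The decomposition below is the same arithmetic pulled out into plain C++ (illustrative only; Im2colCoord and im2col_decompose are not package symbols):

    // Decompose a flat index i in [0, OW*KW*KH) into kernel-x, kernel-y and
    // output-x coordinates, exactly as the new im2col_f32_f16 kernel does.
    struct Im2colCoord { int kx, ky, ix; };

    static Im2colCoord im2col_decompose(int i, int OW, int KW, int KH) {
        const int ksize = OW * (KH > 1 ? KW : 1);
        const int kx = i / ksize;
        const int kd = kx * ksize;
        const int ky = (i - kd) / OW;
        const int ix = i % OW;
        return {kx, ky, ix};
    }
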
@@ -5069,7 +5356,6 @@ struct bin_bcast_cuda {
5069
5356
 
5070
5357
  GGML_TENSOR_BINARY_OP_LOCALS
5071
5358
 
5072
-
5073
5359
  int nr0 = ne10/ne0;
5074
5360
  int nr1 = ne11/ne1;
5075
5361
  int nr2 = ne12/ne2;
@@ -5117,26 +5403,28 @@ struct bin_bcast_cuda {
5117
5403
  int64_t ne12 = cne1[2];
5118
5404
  int64_t ne13 = cne1[3];
5119
5405
 
5120
- //size_t nb0 = cnb0[0];
5406
+ size_t nb0 = cnb0[0];
5121
5407
  size_t nb1 = cnb0[1];
5122
5408
  size_t nb2 = cnb0[2];
5123
5409
  size_t nb3 = cnb0[3];
5124
5410
 
5125
- //size_t nb10 = cnb1[0];
5411
+ size_t nb10 = cnb1[0];
5126
5412
  size_t nb11 = cnb1[1];
5127
5413
  size_t nb12 = cnb1[2];
5128
5414
  size_t nb13 = cnb1[3];
5129
5415
 
5130
- //size_t s0 = nb0 / sizeof(src1_t);
5131
- size_t s1 = nb1 / sizeof(src1_t);
5132
- size_t s2 = nb2 / sizeof(src1_t);
5133
- size_t s3 = nb3 / sizeof(src1_t);
5416
+ size_t s0 = nb0 / sizeof(dst_t);
5417
+ size_t s1 = nb1 / sizeof(dst_t);
5418
+ size_t s2 = nb2 / sizeof(dst_t);
5419
+ size_t s3 = nb3 / sizeof(dst_t);
5134
5420
 
5135
- //size_t s10 = nb10 / sizeof(src1_t);
5421
+ size_t s10 = nb10 / sizeof(src1_t);
5136
5422
  size_t s11 = nb11 / sizeof(src1_t);
5137
5423
  size_t s12 = nb12 / sizeof(src1_t);
5138
5424
  size_t s13 = nb13 / sizeof(src1_t);
5139
5425
 
5426
+ GGML_ASSERT(s0 == 1);
5427
+ GGML_ASSERT(s10 == 1);
5140
5428
 
5141
5429
  const int block_size = 128;
5142
5430
 
@@ -5174,6 +5462,13 @@ struct bin_bcast_cuda {
5174
5462
  }
5175
5463
  };
5176
5464
 
5465
+ static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
5466
+ const int ne10, const int ne11, const int ne12,
5467
+ const int nb1, const int nb2, const int offset, cudaStream_t stream) {
5468
+ int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
5469
+ acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
5470
+ }
5471
+
5177
5472
  static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
5178
5473
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
5179
5474
  gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -5184,11 +5479,26 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
5184
5479
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
5185
5480
  }
5186
5481
 
5482
+ static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
5483
+ const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
5484
+ gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
5485
+ }
5486
+
5487
+ static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
5488
+ const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
5489
+ tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
5490
+ }
5491
+
5187
5492
  static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
5188
5493
  const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
5189
5494
  relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
5190
5495
  }
5191
5496
 
5497
+ static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
5498
+ const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
5499
+ leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
5500
+ }
5501
+
5192
5502
  static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
5193
5503
  const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
5194
5504
  sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -5205,6 +5515,38 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
5205
5515
  }
5206
5516
  }
5207
5517
 
5518
+ static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
5519
+ static const float eps = 1e-6f;
5520
+ if (group_size < 1024) {
5521
+ const dim3 block_dims(WARP_SIZE, 1, 1);
5522
+ group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
5523
+ } else {
5524
+ const dim3 block_dims(1024, 1, 1);
5525
+ group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
5526
+ }
5527
+ }
5528
+
5529
+ static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
5530
+ int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
5531
+ dim3 gridDim(num_blocks, ne1, ne2);
5532
+ concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
5533
+ }
5534
+
5535
+ static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
5536
+ int ne0 = (ne00 * scale_factor);
5537
+ int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
5538
+ dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
5539
+ upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
5540
+ }
5541
+
5542
+ static void pad_f32_cuda(const float * x, float * dst,
5543
+ const int ne00, const int ne01, const int ne02,
5544
+ const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
5545
+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
5546
+ dim3 gridDim(num_blocks, ne1, ne2);
5547
+ pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
5548
+ }
5549
+
5208
5550
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
5209
5551
  GGML_ASSERT(ncols % WARP_SIZE == 0);
5210
5552
  if (ncols < 1024) {
@@ -6167,13 +6509,14 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
6167
6509
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
6168
6510
  }
6169
6511
 
6170
- static void im2col_f32_f16_cuda(const float * x, half * dst,
6171
- int OH, int IW, int IH, int OW, int IC,
6172
- int KH, int KW, int N, int ofs0, int ofs1,
6173
- int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
6174
- dim3 block_nums(IC, OH, OW);
6175
- dim3 block_dims(N, KH, KW);
6176
- im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
6512
+ static void im2col_f32_f16_cuda(const float* x, half* dst,
6513
+ int IW, int IH, int OW, int OH, int KW, int KH, int IC,
6514
+ int offset_delta,
6515
+ int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
6516
+ const int parallel_elements = OW * KW * KH;
6517
+ const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
6518
+ dim3 block_nums(num_blocks, OH, IC);
6519
+ im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
6177
6520
  }
6178
6521
 
6179
6522
  // buffer pool for cuda
@@ -6447,39 +6790,38 @@ static void ggml_cuda_op_get_rows(
6447
6790
 
6448
6791
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
6449
6792
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
6450
- GGML_ASSERT(ggml_is_contiguous(src0));
6451
- GGML_ASSERT(ggml_is_contiguous(src1));
6452
- GGML_ASSERT(ggml_is_contiguous(dst));
6453
6793
 
6454
- const int ncols = src0->ne[0];
6455
- const int nrows = ggml_nelements(src1);
6794
+ GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
6795
+ GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
6796
+ GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
6456
6797
 
6457
6798
  const int32_t * src1_i32 = (const int32_t *) src1_d;
6458
6799
 
6459
6800
  switch (src0->type) {
6460
6801
  case GGML_TYPE_F16:
6461
- get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6802
+ get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
6462
6803
  break;
6463
6804
  case GGML_TYPE_F32:
6464
- get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6805
+ get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6465
6806
  break;
6466
6807
  case GGML_TYPE_Q4_0:
6467
- get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6808
+ get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6468
6809
  break;
6469
6810
  case GGML_TYPE_Q4_1:
6470
- get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6811
+ get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6471
6812
  break;
6472
6813
  case GGML_TYPE_Q5_0:
6473
- get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6814
+ get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6474
6815
  break;
6475
6816
  case GGML_TYPE_Q5_1:
6476
- get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6817
+ get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6477
6818
  break;
6478
6819
  case GGML_TYPE_Q8_0:
6479
- get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6820
+ get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6480
6821
  break;
6481
6822
  default:
6482
6823
  // TODO: k-quants
6824
+ fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
6483
6825
  GGML_ASSERT(false);
6484
6826
  break;
6485
6827
  }
@@ -6522,6 +6864,25 @@ inline void ggml_cuda_op_add(
6522
6864
  ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6523
6865
  }
6524
6866
 
6867
+ inline void ggml_cuda_op_acc(
6868
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6869
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6870
+
6871
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6872
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
6873
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6874
+ GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
6875
+
6876
+ int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
6877
+ int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
6878
+ // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
6879
+ int offset = dst->op_params[3] / 4; // offset in bytes
6880
+
6881
+ acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
6882
+
6883
+ (void) dst;
6884
+ }
6885
+
6525
6886
  inline void ggml_cuda_op_mul(
6526
6887
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6527
6888
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6564,6 +6925,34 @@ inline void ggml_cuda_op_silu(
6564
6925
  (void) src1_dd;
6565
6926
  }
6566
6927
 
6928
+ inline void ggml_cuda_op_gelu_quick(
6929
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6930
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6931
+
6932
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6933
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6934
+
6935
+ gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6936
+
6937
+ (void) src1;
6938
+ (void) dst;
6939
+ (void) src1_dd;
6940
+ }
6941
+
6942
+ inline void ggml_cuda_op_tanh(
6943
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6944
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6945
+
6946
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6947
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6948
+
6949
+ tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6950
+
6951
+ (void) src1;
6952
+ (void) dst;
6953
+ (void) src1_dd;
6954
+ }
6955
+
6567
6956
  inline void ggml_cuda_op_relu(
6568
6957
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6569
6958
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6578,6 +6967,23 @@ inline void ggml_cuda_op_relu(
6578
6967
  (void) src1_dd;
6579
6968
  }
6580
6969
 
6970
+ inline void ggml_cuda_op_leaky_relu(
6971
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6972
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6973
+
6974
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6975
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6976
+
6977
+ float negative_slope;
6978
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
6979
+
6980
+ leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
6981
+
6982
+ (void) src1;
6983
+ (void) dst;
6984
+ (void) src1_dd;
6985
+ }
6986
+
6581
6987
  inline void ggml_cuda_op_sqr(
6582
6988
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6583
6989
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6612,6 +7018,73 @@ inline void ggml_cuda_op_norm(
6612
7018
  (void) src1_dd;
6613
7019
  }
6614
7020
 
7021
+
7022
+ inline void ggml_cuda_op_group_norm(
7023
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7024
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7025
+
7026
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7027
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7028
+
7029
+ int num_groups = dst->op_params[0];
7030
+ int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
7031
+ group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
7032
+
7033
+ (void) src1;
7034
+ (void) dst;
7035
+ (void) src1_dd;
7036
+ }
7037
+
7038
+ inline void ggml_cuda_op_concat(
7039
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7040
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7041
+
7042
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7043
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
7044
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
7045
+
7046
+ for (int i3 = 0; i3 < dst->ne[3]; i3++) {
7047
+ concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
7048
+ }
7049
+
7050
+ (void) src1;
7051
+ (void) dst;
7052
+ }
7053
+
7054
+ inline void ggml_cuda_op_upscale(
7055
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7056
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7057
+
7058
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7059
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
7060
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
7061
+
7062
+ const int scale_factor = dst->op_params[0];
7063
+
7064
+ upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
7065
+
7066
+ (void) src1;
7067
+ (void) dst;
7068
+ (void) src1_dd;
7069
+ }
7070
+
7071
+ inline void ggml_cuda_op_pad(
7072
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7073
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7074
+
7075
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7076
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
7077
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
7078
+
7079
+ pad_f32_cuda(src0_dd, dst_dd,
7080
+ src0->ne[0], src0->ne[1], src0->ne[2],
7081
+ dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
7082
+
7083
+ (void) src1;
7084
+ (void) dst;
7085
+ (void) src1_dd;
7086
+ }
7087
+
6615
7088
  inline void ggml_cuda_op_rms_norm(
6616
7089
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6617
7090
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6913,7 +7386,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
6913
7386
 
6914
7387
  const int compute_capability = g_compute_capabilities[id];
6915
7388
 
6916
- if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
7389
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
6917
7390
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
6918
7391
  half * src0_as_f16 = nullptr;
6919
7392
  size_t src0_as = 0;
@@ -7126,7 +7599,6 @@ inline void ggml_cuda_op_im2col(
7126
7599
 
7127
7600
  const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
7128
7601
 
7129
- const int64_t N = src1->ne[is_2D ? 3 : 2];
7130
7602
  const int64_t IC = src1->ne[is_2D ? 2 : 1];
7131
7603
  const int64_t IH = is_2D ? src1->ne[1] : 1;
7132
7604
  const int64_t IW = src1->ne[0];
@@ -7137,17 +7609,15 @@ inline void ggml_cuda_op_im2col(
7137
7609
  const int64_t OH = is_2D ? dst->ne[2] : 1;
7138
7610
  const int64_t OW = dst->ne[1];
7139
7611
 
7140
- const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
7141
- const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
7612
+ const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
7142
7613
 
7143
- im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
7144
- OH, IW, IH, OW, IC, KH, KW, N,
7145
- ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
7614
+ im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
7146
7615
 
7147
7616
  (void) src0;
7148
7617
  (void) src0_dd;
7149
7618
  }
7150
7619
 
7620
+
7151
7621
  inline void ggml_cuda_op_sum_rows(
7152
7622
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7153
7623
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7230,17 +7700,9 @@ inline void ggml_cuda_op_scale(
7230
7700
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7231
7701
 
7232
7702
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7233
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
7234
7703
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
7235
7704
 
7236
- float scale;
7237
- // HACK: support for ggml backend interface
7238
- if (src1->backend == GGML_BACKEND_CPU) {
7239
- scale = ((float *) src1->data)[0];
7240
- } else {
7241
- // TODO: pass pointer to kernel instead of copying to host
7242
- CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
7243
- }
7705
+ const float scale = ((float *) dst->op_params)[0];
7244
7706
 
7245
7707
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
7246
7708
  CUDA_CHECK(cudaGetLastError());
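ggml_cuda_op_scale now reads the scale factor from the op's parameters instead of from a second tensor, which removes the old CPU/GPU special-casing and the device-to-host copy (the matching cleanup of ggml_cuda_op_flatten is in the next hunk). op_params is a small int32_t array, and float parameters are bit-copied out of it; a sketch of that access (read_op_param_f32 is a hypothetical helper, the code above simply casts the pointer):

    #include <cstdint>
    #include <cstring>

    // Read a float that was stored bitwise in an int32_t op_params slot.
    static float read_op_param_f32(const int32_t * op_params, int idx) {
        float value;
        std::memcpy(&value, &op_params[idx], sizeof(float));
        return value;
    }
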
@@ -7287,8 +7749,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
7287
7749
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
7288
7750
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
7289
7751
 
7290
- const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
7291
-
7292
7752
  // dd = data device
7293
7753
  float * src0_ddf = nullptr;
7294
7754
  float * src1_ddf = nullptr;
@@ -7309,7 +7769,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
7309
7769
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
7310
7770
  }
7311
7771
 
7312
- if (use_src1 && !src1_stays_on_host) {
7772
+ if (use_src1) {
7313
7773
  if (src1_on_device) {
7314
7774
  src1_ddf = (float *) src1_extra->data_device[g_main_device];
7315
7775
  } else {
@@ -7357,6 +7817,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
7357
7817
  }
7358
7818
 
7359
7819
  #ifdef NDEBUG
7820
+ for (int id = 0; id < g_device_count; ++id) {
7821
+ CUDA_CHECK(ggml_cuda_set_device(id));
7822
+ CUDA_CHECK(cudaDeviceSynchronize());
7823
+ }
7824
+
7360
7825
  for (int id = 0; id < g_device_count; ++id) {
7361
7826
  CUDA_CHECK(ggml_cuda_set_device(id));
7362
7827
 
@@ -7408,8 +7873,6 @@ static void ggml_cuda_op_mul_mat(
7408
7873
  const int nb2 = dst->nb[2];
7409
7874
  const int nb3 = dst->nb[3];
7410
7875
 
7411
- ggml_cuda_set_peer_access(ne11);
7412
-
7413
7876
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
7414
7877
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
7415
7878
 
@@ -7696,6 +8159,10 @@ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, gg
7696
8159
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
7697
8160
  }
7698
8161
 
8162
+ static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8163
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
8164
+ }
8165
+
7699
8166
  static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7700
8167
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
7701
8168
  }
@@ -7712,10 +8179,22 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
7712
8179
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
7713
8180
  }
7714
8181
 
8182
+ static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8183
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
8184
+ }
8185
+
8186
+ static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8187
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
8188
+ }
8189
+
7715
8190
  static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7716
8191
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
7717
8192
  }
7718
8193
 
8194
+ static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8195
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
8196
+ }
8197
+
7719
8198
  static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7720
8199
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
7721
8200
  }
@@ -7724,6 +8203,22 @@ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, g
7724
8203
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
7725
8204
  }
7726
8205
 
8206
+ static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8207
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
8208
+ }
8209
+
8210
+ static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8211
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
8212
+ }
8213
+
8214
+ static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8215
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
8216
+ }
8217
+
8218
+ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8219
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
8220
+ }
8221
+
7727
8222
  static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7728
8223
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
7729
8224
  }
@@ -7808,27 +8303,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7808
8303
  }
7809
8304
 
7810
8305
  static __global__ void k_compute_batched_ptrs(
7811
- const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
8306
+ const half * src0_as_f16, const half * src1_as_f16, char * dst,
7812
8307
  const void ** ptrs_src, void ** ptrs_dst,
7813
- int ne12, int ne13,
7814
- int ne23,
7815
- int nb02, int nb03,
7816
- int nb12, int nb13,
7817
- int nb2, int nb3,
7818
- int r2, int r3) {
7819
- int i13 = blockIdx.x * blockDim.x + threadIdx.x;
7820
- int i12 = blockIdx.y * blockDim.y + threadIdx.y;
8308
+ int64_t ne12, int64_t ne13,
8309
+ int64_t ne23,
8310
+ size_t nb02, size_t nb03,
8311
+ size_t nb12, size_t nb13,
8312
+ size_t nbd2, size_t nbd3,
8313
+ int64_t r2, int64_t r3) {
8314
+ int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
8315
+ int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
7821
8316
 
7822
8317
  if (i13 >= ne13 || i12 >= ne12) {
7823
8318
  return;
7824
8319
  }
7825
8320
 
7826
- int i03 = i13 / r3;
7827
- int i02 = i12 / r2;
8321
+ int64_t i03 = i13 / r3;
8322
+ int64_t i02 = i12 / r2;
7828
8323
 
7829
8324
  ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
7830
8325
  ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
7831
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
8326
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
7832
8327
  }
7833
8328
 
7834
8329
  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7884,7 +8379,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7884
8379
  to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
7885
8380
 
7886
8381
  size_t dst_as = 0;
7887
- half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8382
+
8383
+ half * dst_f16 = nullptr;
8384
+ char * dst_t = nullptr;
8385
+
8386
+ cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
8387
+ cudaDataType_t cu_data_type = CUDA_R_16F;
8388
+
8389
+ // dst strides
8390
+ size_t nbd2 = dst->nb[2];
8391
+ size_t nbd3 = dst->nb[3];
8392
+
8393
+ const half alpha_f16 = 1.0f;
8394
+ const half beta_f16 = 0.0f;
8395
+
8396
+ const float alpha_f32 = 1.0f;
8397
+ const float beta_f32 = 0.0f;
8398
+
8399
+ const void * alpha = &alpha_f16;
8400
+ const void * beta = &beta_f16;
8401
+
8402
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8403
+ dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8404
+ dst_t = (char *) dst_f16;
8405
+
8406
+ nbd2 /= sizeof(float) / sizeof(half);
8407
+ nbd3 /= sizeof(float) / sizeof(half);
8408
+ } else {
8409
+ dst_t = (char *) dst_ddf;
8410
+
8411
+ cu_compute_type = CUBLAS_COMPUTE_32F;
8412
+ cu_data_type = CUDA_R_32F;
8413
+
8414
+ alpha = &alpha_f32;
8415
+ beta = &beta_f32;
8416
+ }
7888
8417
 
7889
8418
  GGML_ASSERT(ne12 % ne02 == 0);
7890
8419
  GGML_ASSERT(ne13 % ne03 == 0);
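The batched cuBLAS matmul now honors the requested precision: with GGML_PREC_DEFAULT it keeps the FP16 compute type and a temporary FP16 destination (converted to FP32 afterwards), while a higher precision request switches to FP32 compute and writes the float destination directly. A condensed view of the type selection, assuming the CUDA 11+ cuBLAS enums used in the hunk and GGML_PREC_DEFAULT == 0 as in ggml; pick_gemm_precision is illustrative only:

    #include <cublas_v2.h>

    struct GemmPrecision {
        cublasComputeType_t compute;
        cudaDataType_t      output;
    };

    static GemmPrecision pick_gemm_precision(int32_t prec_param) {
        if (prec_param == 0 /* GGML_PREC_DEFAULT */) {
            return { CUBLAS_COMPUTE_16F, CUDA_R_16F };  // fp16 GEMM + later fp16->fp32 copy
        }
        return { CUBLAS_COMPUTE_32F, CUDA_R_32F };      // write fp32 directly
    }
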
@@ -7893,9 +8422,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7893
8422
  const int64_t r2 = ne12/ne02;
7894
8423
  const int64_t r3 = ne13/ne03;
7895
8424
 
7896
- const half alpha_f16 = 1.0f;
7897
- const half beta_f16 = 0.0f;
7898
-
7899
8425
  #if 0
7900
8426
  // use cublasGemmEx
7901
8427
  {
@@ -7905,12 +8431,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7905
8431
  int i02 = i12 / r2;
7906
8432
 
7907
8433
  CUBLAS_CHECK(
7908
- cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
8434
+ cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7909
8435
  ne01, ne11, ne10,
7910
- &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
7911
- (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
7912
- &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
7913
- CUBLAS_COMPUTE_16F,
8436
+ alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
8437
+ (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
8438
+ beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
8439
+ cu_compute_type,
7914
8440
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7915
8441
  }
7916
8442
  }
@@ -7922,11 +8448,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7922
8448
  CUBLAS_CHECK(
7923
8449
  cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7924
8450
  ne01, ne11, ne10,
7925
- &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
7926
- (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
7927
- &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
8451
+ alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
8452
+ (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
8453
+ beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
7928
8454
  ne12*ne13,
7929
- CUBLAS_COMPUTE_16F,
8455
+ cu_compute_type,
7930
8456
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7931
8457
  } else {
7932
8458
  // use cublasGemmBatchedEx
@@ -7943,24 +8469,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7943
8469
 
7944
8470
  dim3 block_dims(ne13, ne12);
7945
8471
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
7946
- src0_as_f16, src1_as_f16, dst_f16,
8472
+ src0_as_f16, src1_as_f16, dst_t,
7947
8473
  ptrs_src, ptrs_dst,
7948
8474
  ne12, ne13,
7949
8475
  ne23,
7950
8476
  nb02, nb03,
7951
8477
  nb12, nb13,
7952
- dst->nb[2], dst->nb[3],
8478
+ nbd2, nbd3,
7953
8479
  r2, r3);
7954
8480
  CUDA_CHECK(cudaGetLastError());
7955
8481
 
7956
8482
  CUBLAS_CHECK(
7957
8483
  cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7958
8484
  ne01, ne11, ne10,
7959
- &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7960
- (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
7961
- &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
8485
+ alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
8486
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
8487
+ beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
7962
8488
  ne23,
7963
- CUBLAS_COMPUTE_16F,
8489
+ cu_compute_type,
7964
8490
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7965
8491
 
7966
8492
  if (ptrs_src_s != 0) {
@@ -7972,11 +8498,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7972
8498
  }
7973
8499
  #endif
7974
8500
 
7975
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
7976
- to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8501
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8502
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8503
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8504
+
8505
+ ggml_cuda_pool_free(dst_f16, dst_as);
8506
+ }
7977
8507
 
7978
8508
  ggml_cuda_pool_free(src1_as_f16, src1_as);
7979
- ggml_cuda_pool_free(dst_f16, dst_as);
7980
8509
  }
7981
8510
 
7982
8511
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
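A rough summary of the precision dispatch introduced above, as a standalone sketch: it assumes the CUDA 11+ cuBLAS and FP16 headers already used by this file, and the local `prec` enum and `gemm_config` struct are illustrative stand-ins for ggml's `GGML_PREC_*` values rather than the library's actual API.

    #include <cublas_v2.h>
    #include <cuda_fp16.h>

    // illustrative stand-in for ggml's precision flag (GGML_PREC_DEFAULT / GGML_PREC_F32)
    enum prec { PREC_DEFAULT, PREC_F32 };

    struct gemm_config {
        cublasComputeType_t compute_type; // accumulation precision passed to cuBLAS
        cudaDataType_t      out_type;     // element type of the C/dst matrix
        const void *        alpha;        // scalar matching the compute precision
        const void *        beta;
    };

    // PREC_DEFAULT: accumulate in F16 and write to an F16 scratch buffer that is
    // converted to F32 afterwards; otherwise accumulate in F32 and write the F32
    // destination directly, as the batched path above does.
    static gemm_config select_gemm_config(prec p) {
        static const half  alpha_f16 = 1.0f;
        static const half  beta_f16  = 0.0f;
        static const float alpha_f32 = 1.0f;
        static const float beta_f32  = 0.0f;

        if (p == PREC_DEFAULT) {
            return { CUBLAS_COMPUTE_16F, CUDA_R_16F, &alpha_f16, &beta_f16 };
        }
        return { CUBLAS_COMPUTE_32F, CUDA_R_32F, &alpha_f32, &beta_f32 };
    }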
@@ -8234,36 +8763,145 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
8234
8763
  }
8235
8764
  #endif
8236
8765
 
8237
- static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
8766
+ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8238
8767
  #if 0
8239
- //#ifdef CUDA_USE_TENSOR_CORES
8240
- // const bool use_tensor_cores = true;
8241
- //#else
8242
- // const bool use_tensor_cores = false;
8243
- //#endif
8244
-
8245
8768
  ggml_cuda_mul_mat_id_cublas(dst);
8246
-
8247
8769
  // TODO: mmq/mmv support
8248
- #else
8249
- const struct ggml_tensor * ids = dst->src[0];
8250
- const struct ggml_tensor * src1 = dst->src[1];
8251
- const int id = dst->op_params[0];
8770
+ #endif
8252
8771
 
8253
- int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8772
+ const int64_t nb11 = src1->nb[1];
8773
+ const int64_t nb1 = dst->nb[1];
8254
8774
 
8255
- int32_t a_id;
8256
- CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8257
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8775
+ const struct ggml_tensor * ids = src0;
8776
+ const int32_t id = ((int32_t *) dst->op_params)[0];
8777
+ const int32_t n_as = ((int32_t *) dst->op_params)[1];
8258
8778
 
8259
- GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
8260
- const struct ggml_tensor * src0 = dst->src[a_id + 2];
8779
+ std::vector<char> ids_host(ggml_nbytes(ids));
8261
8780
 
8262
- ggml_cuda_mul_mat(src0, src1, dst);
8263
- #endif
8781
+ const cudaStream_t stream = g_cudaStreams[g_main_device][0];
8782
+
8783
+ if (ids->backend == GGML_BACKEND_GPU) {
8784
+ const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8785
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
8786
+ CUDA_CHECK(cudaStreamSynchronize(stream));
8787
+ } else {
8788
+ memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
8789
+ }
8264
8790
 
8265
- (void) _src0;
8266
- (void) _src1;
8791
+ const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
8792
+ const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
8793
+
8794
+ ggml_tensor_extra_gpu src1_row_extra;
8795
+ ggml_tensor_extra_gpu dst_row_extra;
8796
+
8797
+ ggml_tensor src1_row = *src1;
8798
+ ggml_tensor dst_row = *dst;
8799
+
8800
+ src1_row.backend = GGML_BACKEND_GPU;
8801
+ dst_row.backend = GGML_BACKEND_GPU;
8802
+
8803
+ src1_row.extra = &src1_row_extra;
8804
+ dst_row.extra = &dst_row_extra;
8805
+
8806
+ char * src1_original = src1->backend == GGML_BACKEND_CPU ?
8807
+ (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
8808
+ char * dst_original = dst->backend == GGML_BACKEND_CPU ?
8809
+ (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
8810
+
8811
+ if (src1->ne[1] == 1) {
8812
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
8813
+ GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
8814
+
8815
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8816
+ //int32_t row_id;
8817
+ //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8818
+ //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8819
+
8820
+ const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8821
+
8822
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
8823
+
8824
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
8825
+
8826
+ src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
8827
+ src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
8828
+
8829
+ dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
8830
+ dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
8831
+
8832
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8833
+ }
8834
+ } else {
8835
+ size_t as_src1, as_dst;
8836
+ char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
8837
+ char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
8838
+
8839
+ src1_row_extra.data_device[g_main_device] = src1_contiguous;
8840
+ dst_row_extra.data_device[g_main_device] = dst_contiguous;
8841
+
8842
+ const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
8843
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
8844
+ const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
8845
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
8846
+
8847
+ for (int32_t row_id = 0; row_id < n_as; ++row_id) {
8848
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
8849
+
8850
+ int64_t num_src1_rows = 0;
8851
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8852
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8853
+
8854
+ if (row_id_i != row_id) {
8855
+ continue;
8856
+ }
8857
+
8858
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
8859
+
8860
+ CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
8861
+ nb11, src1_kind, stream));
8862
+ num_src1_rows++;
8863
+ }
8864
+
8865
+ if (num_src1_rows == 0) {
8866
+ continue;
8867
+ }
8868
+
8869
+ src1_row.ne[1] = num_src1_rows;
8870
+ dst_row.ne[1] = num_src1_rows;
8871
+
8872
+ src1_row.nb[1] = nb11;
8873
+ src1_row.nb[2] = num_src1_rows*nb11;
8874
+ src1_row.nb[3] = num_src1_rows*nb11;
8875
+
8876
+ dst_row.nb[1] = nb1;
8877
+ dst_row.nb[2] = num_src1_rows*nb1;
8878
+ dst_row.nb[3] = num_src1_rows*nb1;
8879
+
8880
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8881
+
8882
+ num_src1_rows = 0;
8883
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8884
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8885
+
8886
+ if (row_id_i != row_id) {
8887
+ continue;
8888
+ }
8889
+
8890
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
8891
+
8892
+ CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
8893
+ nb1, dst_kind, stream));
8894
+ num_src1_rows++;
8895
+ }
8896
+ }
8897
+
8898
+ ggml_cuda_pool_free(src1_contiguous, as_src1);
8899
+ ggml_cuda_pool_free(dst_contiguous, as_dst);
8900
+ }
8901
+
8902
+ if (dst->backend == GGML_BACKEND_CPU) {
8903
+ CUDA_CHECK(cudaStreamSynchronize(stream));
8904
+ }
8267
8905
  }
8268
8906
 
8269
8907
  static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
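The multi-row branch of ggml_cuda_mul_mat_id above gathers the rows of src1 that were routed to a given expert into a contiguous buffer, runs one matmul per expert, and scatters the results back. A CPU-only sketch of that grouping, with a stubbed-out `mul_mat_rows` and hypothetical `ids`/`n_as` values (none of which come from the diff), looks like this:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // stand-in for one expert's matmul over a contiguous block of gathered rows
    static void mul_mat_rows(int expert, int64_t n_rows) {
        printf("expert %d: multiply %lld gathered rows\n", expert, (long long) n_rows);
    }

    int main() {
        const int     n_as   = 4;                                  // number of experts (hypothetical)
        const int64_t n_rows = 8;                                  // rows of src1 (hypothetical)
        const std::vector<int32_t> ids = {1, 0, 1, 3, 0, 1, 3, 0}; // routed expert per row

        std::vector<int64_t> gathered;                             // src1 row indices for one expert

        for (int expert = 0; expert < n_as; ++expert) {
            // gather: collect the rows routed to this expert into a contiguous block
            gathered.clear();
            for (int64_t i = 0; i < n_rows; ++i) {
                if (ids[i] == expert) {
                    gathered.push_back(i);
                }
            }
            if (gathered.empty()) {
                continue;                                          // expert unused in this batch
            }
            // one matmul over the contiguous block instead of one per routed row
            mul_mat_rows(expert, (int64_t) gathered.size());
            // scatter: results would be copied back to the original row positions here
        }
        return 0;
    }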
@@ -8373,6 +9011,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg
8373
9011
  (void) dst;
8374
9012
  }
8375
9013
 
9014
+ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
9015
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
9016
+
9017
+ return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
9018
+ }
9019
+
8376
9020
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
8377
9021
  const int64_t nrows = ggml_nrows(tensor);
8378
9022
 
@@ -8422,13 +9066,12 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
8422
9066
 
8423
9067
  // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
8424
9068
  if (ne0 % MATRIX_ROW_PADDING != 0) {
8425
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8426
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
9069
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
8427
9070
  }
8428
9071
 
8429
9072
  char * buf;
8430
9073
  CUDA_CHECK(cudaMalloc(&buf, size));
8431
- char * buf_host = (char*)data + offset_split;
9074
+ char * buf_host = (char *)data + offset_split;
8432
9075
 
8433
9076
  // set padding to 0 to avoid possible NaN values
8434
9077
  if (size > original_size) {
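The padding logic above reserves room for one extra partial row of (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) elements so that kernels may read up to the next multiple of 512 elements without going out of bounds. A standalone sketch of the arithmetic for a hypothetical Q4_0 tensor (32-element blocks of 18 bytes; the row length and row count below are made up):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t matrix_row_padding = 512; // same granularity as MATRIX_ROW_PADDING above
        const int64_t blck_size = 32;           // elements per Q4_0 block
        const int64_t type_size = 18;           // bytes per Q4_0 block (fp16 scale + 16 bytes of nibbles)

        const int64_t ne0   = 4000;             // hypothetical row length, not a multiple of 512
        const int64_t nrows = 3;                // hypothetical number of rows

        // bytes needed for one row of n elements: n/blck_size blocks of type_size bytes
        auto row_size = [&](int64_t n) { return n/blck_size * type_size; };

        int64_t size = nrows*row_size(ne0);
        if (ne0 % matrix_row_padding != 0) {
            // reserve one extra partial row so reads up to the next multiple of 512 stay in bounds
            size += row_size(matrix_row_padding - ne0 % matrix_row_padding);
        }
        printf("unpadded: %lld bytes, padded: %lld bytes\n",
               (long long) (nrows*row_size(ne0)), (long long) size);
        return 0;
    }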
@@ -8450,7 +9093,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
8450
9093
  }
8451
9094
 
8452
9095
  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
8453
- if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
9096
+ if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
8454
9097
  return;
8455
9098
  }
8456
9099
 
@@ -8573,11 +9216,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
8573
9216
 
8574
9217
  ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
8575
9218
 
8576
- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
8577
- tensor->op == GGML_OP_VIEW;
9219
+ const bool inplace = tensor->view_src != nullptr;
8578
9220
 
8579
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
8580
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
9221
+ if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
9222
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
8581
9223
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
8582
9224
  size_t view_offset = 0;
8583
9225
  if (tensor->op == GGML_OP_VIEW) {
@@ -8657,14 +9299,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8657
9299
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
8658
9300
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
8659
9301
 
8660
- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
9302
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
8661
9303
  return false;
8662
9304
  }
8663
9305
 
8664
9306
  if (tensor->op == GGML_OP_MUL_MAT) {
8665
9307
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
8666
9308
  #ifndef NDEBUG
8667
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
9309
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
8668
9310
  #endif
8669
9311
  return false;
8670
9312
  }
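The fprintf fix in this hunk restores the '%' in front of PRId64: the macro only supplies the length/conversion suffix, so without the leading '%' the old format string printed the conversion letters literally instead of the value. A minimal demonstration:

    #include <cinttypes>
    #include <cstdio>

    int main() {
        const int64_t ne3 = 12;
        // PRId64 expands to the conversion suffix only (e.g. "ld" or "lld"),
        // so the format string must still supply the leading '%':
        printf("correct: ne[3] = %" PRId64 "\n", ne3);
        // writing just " PRId64 " without the '%' (as the old code did) prints
        // the literal conversion letters instead of the number.
        return 0;
    }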
@@ -8683,6 +9325,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8683
9325
  case GGML_OP_ADD:
8684
9326
  func = ggml_cuda_add;
8685
9327
  break;
9328
+ case GGML_OP_ACC:
9329
+ func = ggml_cuda_acc;
9330
+ break;
8686
9331
  case GGML_OP_MUL:
8687
9332
  func = ggml_cuda_mul;
8688
9333
  break;
@@ -8697,6 +9342,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8697
9342
  case GGML_UNARY_OP_SILU:
8698
9343
  func = ggml_cuda_silu;
8699
9344
  break;
9345
+ case GGML_UNARY_OP_GELU_QUICK:
9346
+ func = ggml_cuda_gelu_quick;
9347
+ break;
9348
+ case GGML_UNARY_OP_TANH:
9349
+ func = ggml_cuda_tanh;
9350
+ break;
8700
9351
  case GGML_UNARY_OP_RELU:
8701
9352
  func = ggml_cuda_relu;
8702
9353
  break;
@@ -8707,6 +9358,21 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8707
9358
  case GGML_OP_NORM:
8708
9359
  func = ggml_cuda_norm;
8709
9360
  break;
9361
+ case GGML_OP_GROUP_NORM:
9362
+ func = ggml_cuda_group_norm;
9363
+ break;
9364
+ case GGML_OP_CONCAT:
9365
+ func = ggml_cuda_concat;
9366
+ break;
9367
+ case GGML_OP_UPSCALE:
9368
+ func = ggml_cuda_upscale;
9369
+ break;
9370
+ case GGML_OP_PAD:
9371
+ func = ggml_cuda_pad;
9372
+ break;
9373
+ case GGML_OP_LEAKY_RELU:
9374
+ func = ggml_cuda_leaky_relu;
9375
+ break;
8710
9376
  case GGML_OP_RMS_NORM:
8711
9377
  func = ggml_cuda_rms_norm;
8712
9378
  break;
@@ -8729,9 +9395,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8729
9395
  func = ggml_cuda_sqr;
8730
9396
  break;
8731
9397
  case GGML_OP_CLAMP:
8732
- if (!any_on_device) {
8733
- return false;
8734
- }
8735
9398
  func = ggml_cuda_clamp;
8736
9399
  break;
8737
9400
  case GGML_OP_CPY:
@@ -8740,6 +9403,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8740
9403
  case GGML_OP_CONT:
8741
9404
  func = ggml_cuda_dup;
8742
9405
  break;
9406
+ case GGML_OP_NONE:
8743
9407
  case GGML_OP_RESHAPE:
8744
9408
  case GGML_OP_VIEW:
8745
9409
  case GGML_OP_PERMUTE:
@@ -8771,6 +9435,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8771
9435
  return false;
8772
9436
  }
8773
9437
 
9438
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
9439
+ ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
9440
+ }
9441
+
8774
9442
  if (params->ith != 0) {
8775
9443
  return true;
8776
9444
  }
@@ -8844,7 +9512,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8844
9512
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8845
9513
 
8846
9514
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
8847
- assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
9515
+ assert(tensor->view_src->buffer->buft == buffer->buft);
8848
9516
  tensor->backend = tensor->view_src->backend;
8849
9517
  tensor->extra = tensor->view_src->extra;
8850
9518
  return;
@@ -8875,23 +9543,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8875
9543
  }
8876
9544
 
8877
9545
  static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
8878
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
8879
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8880
9546
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8881
9547
 
8882
- CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
9548
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8883
9549
 
8884
- UNUSED(buffer);
9550
+ ggml_cuda_set_device(ctx->device);
9551
+ CUDA_CHECK(cudaDeviceSynchronize());
9552
+
9553
+ CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
8885
9554
  }
8886
9555
 
8887
9556
  static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
8888
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
8889
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8890
9557
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8891
9558
 
9559
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
9560
+
9561
+ ggml_cuda_set_device(ctx->device);
9562
+ CUDA_CHECK(cudaDeviceSynchronize());
9563
+
8892
9564
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
9565
+ }
8893
9566
 
8894
- UNUSED(buffer);
9567
+ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
9568
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
9569
+
9570
+ ggml_cuda_set_device(ctx->device);
9571
+ CUDA_CHECK(cudaDeviceSynchronize());
9572
+
9573
+ CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
8895
9574
  }
8896
9575
 
8897
9576
  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@@ -8902,6 +9581,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
8902
9581
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
8903
9582
  /* .cpy_tensor_from = */ NULL,
8904
9583
  /* .cpy_tensor_to = */ NULL,
9584
+ /* .clear = */ ggml_backend_cuda_buffer_clear,
8905
9585
  };
8906
9586
 
8907
9587
  // cuda buffer type
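The buffer callbacks above now select the buffer's device and synchronize it before the blocking cudaMemcpy/cudaMemset. As a rough standalone illustration of that ordering, using only core CUDA runtime calls (device index and sizes below are hypothetical, and error handling is reduced to a small local macro):

    #include <cuda_runtime.h>
    #include <cstdio>

    // minimal error check for the sketch; the real code in this file uses CUDA_CHECK
    #define CUDA_TRY(call)                                                        \
        do {                                                                      \
            cudaError_t err_ = (call);                                            \
            if (err_ != cudaSuccess) {                                            \
                fprintf(stderr, "%s: %s\n", #call, cudaGetErrorString(err_));     \
                return 1;                                                         \
            }                                                                     \
        } while (0)

    int main() {
        const int    device = 0;        // hypothetical device index
        const size_t size   = 1 << 20;  // hypothetical buffer size

        void * dev_ptr = nullptr;
        CUDA_TRY(cudaSetDevice(device));
        CUDA_TRY(cudaMalloc(&dev_ptr, size));

        // "clear": pin the device, wait for pending async work, then memset the buffer
        CUDA_TRY(cudaDeviceSynchronize());
        CUDA_TRY(cudaMemset(dev_ptr, 0, size));

        // "set_tensor": same synchronization, then a blocking host-to-device copy
        char host_data[64] = {0};
        CUDA_TRY(cudaDeviceSynchronize());
        CUDA_TRY(cudaMemcpy(dev_ptr, host_data, sizeof(host_data), cudaMemcpyHostToDevice));

        CUDA_TRY(cudaFree(dev_ptr));
        return 0;
    }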
@@ -8938,8 +9618,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
8938
9618
 
8939
9619
  if (ggml_is_quantized(tensor->type)) {
8940
9620
  if (ne0 % MATRIX_ROW_PADDING != 0) {
8941
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8942
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
9621
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
8943
9622
  }
8944
9623
  }
8945
9624
 
@@ -8954,35 +9633,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
8954
9633
  UNUSED(buft);
8955
9634
  }
8956
9635
 
8957
- static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
9636
+ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
8958
9637
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
8959
9638
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
8960
9639
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
8961
9640
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
9641
+ /* .is_host = */ nullptr,
8962
9642
  };
8963
9643
 
8964
9644
  ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
8965
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
8966
- static bool ggml_backend_buffer_type_cuda_initialized = false;
8967
- if (!ggml_backend_buffer_type_cuda_initialized) {
9645
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
9646
+
9647
+ static bool ggml_backend_cuda_buffer_type_initialized = false;
9648
+
9649
+ if (!ggml_backend_cuda_buffer_type_initialized) {
8968
9650
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
8969
- ggml_backend_buffer_type_cuda[i] = {
8970
- /* .iface = */ cuda_backend_buffer_type_interface,
9651
+ ggml_backend_cuda_buffer_types[i] = {
9652
+ /* .iface = */ ggml_backend_cuda_buffer_type_interface,
8971
9653
  /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
8972
9654
  };
8973
9655
  }
8974
- ggml_backend_buffer_type_cuda_initialized = true;
9656
+ ggml_backend_cuda_buffer_type_initialized = true;
8975
9657
  }
8976
9658
 
8977
- return &ggml_backend_buffer_type_cuda[device];
9659
+ return &ggml_backend_cuda_buffer_types[device];
8978
9660
  }
8979
9661
 
8980
9662
  // host buffer type
8981
9663
 
8982
9664
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
8983
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8984
- CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
8985
- delete ctx;
9665
+ CUDA_CHECK(cudaFreeHost(buffer->context));
8986
9666
  }
8987
9667
 
8988
9668
  static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -8995,24 +9675,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
8995
9675
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
8996
9676
 
8997
9677
  return buffer;
8998
-
8999
- UNUSED(buft);
9000
9678
  }
9001
9679
 
9002
- struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
9003
- /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
9004
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
9005
- /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9006
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
9007
- };
9008
-
9009
9680
  ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
9010
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
9011
- /* .iface = */ cuda_backend_host_buffer_type_interface,
9681
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
9682
+ /* .iface = */ {
9683
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
9684
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
9685
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9686
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
9687
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
9688
+ },
9012
9689
  /* .context = */ nullptr,
9013
9690
  };
9014
9691
 
9015
- return &ggml_backend_buffer_type_cuda_host;
9692
+ return &ggml_backend_cuda_buffer_type_host;
9016
9693
  }
9017
9694
 
9018
9695
  // backend
@@ -9044,8 +9721,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
9044
9721
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9045
9722
 
9046
9723
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
9047
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
9048
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9049
9724
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9050
9725
 
9051
9726
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -9055,8 +9730,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
9055
9730
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9056
9731
 
9057
9732
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
9058
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
9059
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9060
9733
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9061
9734
 
9062
9735
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
@@ -9159,6 +9832,8 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
9159
9832
  case GGML_UNARY_OP_GELU:
9160
9833
  case GGML_UNARY_OP_SILU:
9161
9834
  case GGML_UNARY_OP_RELU:
9835
+ case GGML_UNARY_OP_GELU_QUICK:
9836
+ case GGML_UNARY_OP_TANH:
9162
9837
  return true;
9163
9838
  default:
9164
9839
  return false;
@@ -9181,6 +9856,45 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
9181
9856
  }
9182
9857
  return true;
9183
9858
  } break;
9859
+ case GGML_OP_GET_ROWS:
9860
+ {
9861
+ switch (op->src[0]->type) {
9862
+ case GGML_TYPE_F16:
9863
+ case GGML_TYPE_F32:
9864
+ case GGML_TYPE_Q4_0:
9865
+ case GGML_TYPE_Q4_1:
9866
+ case GGML_TYPE_Q5_0:
9867
+ case GGML_TYPE_Q5_1:
9868
+ case GGML_TYPE_Q8_0:
9869
+ return true;
9870
+ default:
9871
+ return false;
9872
+ }
9873
+ } break;
9874
+ case GGML_OP_CPY:
9875
+ {
9876
+ ggml_type src0_type = op->src[0]->type;
9877
+ ggml_type src1_type = op->src[1]->type;
9878
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
9879
+ return true;
9880
+ }
9881
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
9882
+ return true;
9883
+ }
9884
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
9885
+ return true;
9886
+ }
9887
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
9888
+ return true;
9889
+ }
9890
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
9891
+ return true;
9892
+ }
9893
+ if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
9894
+ return true;
9895
+ }
9896
+ return false;
9897
+ } break;
9184
9898
  case GGML_OP_NONE:
9185
9899
  case GGML_OP_RESHAPE:
9186
9900
  case GGML_OP_VIEW:
@@ -9188,7 +9902,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
9188
9902
  case GGML_OP_TRANSPOSE:
9189
9903
  case GGML_OP_NORM:
9190
9904
  case GGML_OP_REPEAT:
9191
- case GGML_OP_GET_ROWS:
9192
9905
  case GGML_OP_DUP:
9193
9906
  case GGML_OP_ADD:
9194
9907
  case GGML_OP_MUL:
@@ -9197,7 +9910,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
9197
9910
  case GGML_OP_SCALE:
9198
9911
  case GGML_OP_SQR:
9199
9912
  case GGML_OP_CLAMP:
9200
- case GGML_OP_CPY:
9201
9913
  case GGML_OP_CONT:
9202
9914
  case GGML_OP_DIAG_MASK_INF:
9203
9915
  case GGML_OP_SOFT_MAX:
@@ -9206,6 +9918,12 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
9206
9918
  case GGML_OP_IM2COL:
9207
9919
  case GGML_OP_SUM_ROWS:
9208
9920
  case GGML_OP_ARGSORT:
9921
+ case GGML_OP_ACC:
9922
+ case GGML_OP_CONCAT:
9923
+ case GGML_OP_GROUP_NORM:
9924
+ case GGML_OP_UPSCALE:
9925
+ case GGML_OP_PAD:
9926
+ case GGML_OP_LEAKY_RELU:
9209
9927
  return true;
9210
9928
  default:
9211
9929
  return false;
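The GGML_OP_CPY branch added above admits only a fixed whitelist of (src, dst) type pairs. The same check can be stated compactly as below; the `tensor_type` enum is a local stand-in for ggml's ggml_type, used only to keep the sketch self-contained:

    #include <cstdio>

    // local stand-in for the ggml tensor types involved (illustrative only)
    enum tensor_type { T_F32, T_F16, T_Q8_0, T_Q4_0, T_Q4_1 };

    // mirrors the whitelist in ggml_backend_cuda_supports_op for GGML_OP_CPY:
    // F32 -> {F32, F16, Q8_0, Q4_0, Q4_1} and F16 -> F16 are supported.
    static bool cpy_supported(tensor_type src, tensor_type dst) {
        if (src == T_F32) {
            return dst == T_F32 || dst == T_F16 || dst == T_Q8_0 ||
                   dst == T_Q4_0 || dst == T_Q4_1;
        }
        if (src == T_F16) {
            return dst == T_F16;
        }
        return false;
    }

    int main() {
        printf("F32 -> Q4_0 supported: %d\n", cpy_supported(T_F32, T_Q4_0));
        printf("F16 -> F32 supported: %d\n", cpy_supported(T_F16, T_F32));
        return 0;
    }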
@@ -9264,7 +9982,9 @@ static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * use
9264
9982
  UNUSED(params);
9265
9983
  }
9266
9984
 
9267
- extern "C" int ggml_backend_cuda_reg_devices() {
9985
+ extern "C" int ggml_backend_cuda_reg_devices();
9986
+
9987
+ int ggml_backend_cuda_reg_devices() {
9268
9988
  int device_count = ggml_cuda_get_device_count();
9269
9989
  //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
9270
9990
  for (int i = 0; i < device_count; i++) {