llama_cpp 0.10.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,15 @@
 #include <algorithm>
+#include <assert.h>
+#include <atomic>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
-#include <cinttypes>
 #include <float.h>
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
-#include <atomic>
-#include <assert.h>
+#include <vector>
+
 
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -437,6 +439,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_TANH_BLOCK_SIZE 256
 #define CUDA_RELU_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
@@ -449,6 +452,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 #define CUDA_GET_ROWS_BLOCK_SIZE 256
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+#define CUDA_CONCAT_BLOCK_SIZE 256
+#define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ACC_BLOCK_SIZE 256
+#define CUDA_IM2COL_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -610,6 +618,24 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
     dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 }
 
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
     const float GELU_COEF_A = 0.044715f;
     const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
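The `acc_f32` kernel above backs the new GGML_OP_ACC path: `dst` is a copy of `x` (src0), and `y` (src1) is added into the sub-view of `dst` described by the element strides `nb1`/`nb2` and the element `offset`. As a reading aid, here is a minimal CPU sketch of the same index math; `acc_f32_ref` is an editorial name and not part of this diff.

```cpp
// CPU sketch of the acc_f32 index math (editorial illustration, not in the package).
// ne       - number of elements in dst (and x)
// ne10..12 - dimensions of y
// nb1, nb2 - row/plane strides of the destination view, in elements
// offset   - start of the view inside dst, in elements
static void acc_f32_ref(const float * x, const float * y, float * dst, int ne,
                        int ne10, int ne11, int ne12, int nb1, int nb2, int offset) {
    for (int i = 0; i < ne; ++i) {
        const int src1_idx = i - offset;
        const int oz = src1_idx / nb2;
        const int oy = (src1_idx - oz*nb2) / nb1;
        const int ox = src1_idx % nb1;
        if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
            dst[i] = x[i] + y[ox + oy*ne10 + oz*ne10*ne11]; // inside the view: accumulate
        } else {
            dst[i] = x[i];                                  // outside the view: plain copy
        }
    }
}
```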
@@ -632,6 +658,23 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
+}
+
+static __global__ void tanh_f32(const float *x, float *dst, int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = tanhf(x[i]);
+}
+
 static __global__ void relu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -641,6 +684,14 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
     dst[i] = fmaxf(x[i], 0);
 }
 
+static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
+}
+
 static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -686,6 +737,132 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
     }
 }
 
+static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (blockIdx.z < ne02) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            blockIdx.z * ne0 * gridDim.y;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            (blockIdx.z - ne02) * ne0 * gridDim.y;
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
+    int ne0 = ne00 * scale_factor;
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int i00 = nidx / scale_factor;
+    int i01 = blockIdx.y / scale_factor;
+    int offset_src =
+        i00 +
+        i01 * ne00 +
+        blockIdx.z * nb02;
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    dst[offset_dst] = x[offset_src];
+}
+
+static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+template <int block_size>
+static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    int start = blockIdx.x * group_size;
+    int end = start + group_size;
+
+    start += threadIdx.x;
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float variance = tmp / group_size;
+    float scale = rsqrtf(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
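`group_norm_f32` normalizes each group of `group_size` consecutive elements to zero mean and unit variance (stabilized by `eps`), using a warp reduction plus a shared-memory pass when the block spans more than one warp. A plain CPU sketch of the arithmetic it implements, with the hypothetical name `group_norm_ref`:

```cpp
#include <algorithm>
#include <cmath>

// CPU sketch of the group_norm_f32 arithmetic (editorial illustration, not in the package).
static void group_norm_ref(const float * x, float * dst, int num_groups, int group_size,
                           int ne_elements, float eps = 1e-6f) {
    for (int g = 0; g < num_groups; ++g) {
        const int start = g * group_size;
        const int end   = std::min(start + group_size, ne_elements);

        float sum = 0.0f;
        for (int j = start; j < end; ++j) sum += x[j];
        const float mean = sum / group_size; // the kernel divides by group_size, not by end-start

        float ss = 0.0f;
        for (int j = start; j < end; ++j) {
            const float xi = x[j] - mean;
            dst[j] = xi;
            ss += xi * xi;
        }
        const float scale = 1.0f / std::sqrt(ss / group_size + eps);
        for (int j = start; j < end; ++j) dst[j] *= scale;
    }
}
```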
@@ -1684,31 +1861,65 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 }
 
 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
-    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-
-    if (col >= ncols) {
+static __global__ void k_get_rows(
+        const void * src0, const int32_t * src1, dst_t * dst,
+        int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+        /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+        /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+        /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+        size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
         return;
     }
 
-    const int r = y[row];
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
 
-    // copy x[r*ncols + col] to dst[row*ncols + col]
-    const int xi = r*ncols + col;
-    const int di = row*ncols + col;
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
 
-    const int ib = xi/qk; // block index
-    const int iqs = (xi%qk)/qr; // quant index
-    const int iybs = di - di%qk; // y block start index
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
     const int y_offset = qr == 1 ? 1 : qk/2;
 
     // dequantize
     dfloat2 v;
-    dequantize_kernel(x, ib, iqs, v);
+    dequantize_kernel(src0_row, ib, iqs, v);
 
-    dst[iybs + iqs + 0] = v.x;
-    dst[iybs + iqs + y_offset] = v.y;
+    dst_row[iybs + iqs + 0] = v.x;
+    dst_row[iybs + iqs + y_offset] = v.y;
+}
+
+template<typename src0_t, typename dst_t>
+static __global__ void k_get_rows_float(
+        const src0_t * src0, const int32_t * src1, dst_t * dst,
+        int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+        /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+        /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+        /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+        size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
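The rewritten `k_get_rows`/`k_get_rows_float` address src0, src1 and dst through explicit strides (in elements for dst/src1, in bytes for src0) instead of assuming contiguous rows, which is what lets the GET_ROWS op drop its `ggml_is_contiguous` asserts later in this diff and gather from batched 3D index tensors. A CPU sketch of the float path under the same parameter conventions; the names are illustrative and not part of the package:

```cpp
#include <cstddef>
#include <cstdint>

// CPU sketch of the k_get_rows_float addressing (editorial illustration, not in the package).
// s1/s2/s3   - dst strides in elements; s10/s11/s12 - src1 strides in elements
// nb01..nb03 - src0 strides in bytes
static void get_rows_float_ref(const float * src0, const int32_t * src1, float * dst,
                               int64_t ne00, int64_t ne10, int64_t ne11, int64_t ne12,
                               size_t s1, size_t s2, size_t s3,
                               size_t nb01, size_t nb02, size_t nb03,
                               size_t s10, size_t s11, size_t s12) {
    for (int64_t i12 = 0; i12 < ne12; ++i12)
    for (int64_t i11 = 0; i11 < ne11; ++i11)
    for (int64_t i10 = 0; i10 < ne10; ++i10) {
        const int64_t i01 = src1[i10*s10 + i11*s11 + i12*s12]; // row to gather
        const float * src0_row =
            (const float *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
        float * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
        for (int64_t i00 = 0; i00 < ne00; ++i00) {
            dst_row[i00] = src0_row[i00];
        }
    }
}
```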
@@ -5035,29 +5246,98 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
 
 static __global__ void im2col_f32_f16(
     const float * x, half * dst,
-    int ofs0, int ofs1, int IW, int IH, int CHW,
+    int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
     int s0, int s1, int p0, int p1, int d0, int d1) {
-    const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
-    const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i >= pelements) {
+        return;
+    }
+
+    const int ksize = OW * (KH > 1 ? KW : 1);
+    const int kx = i / ksize;
+    const int kd = kx * ksize;
+    const int ky = (i - kd) / OW;
+    const int ix = i % OW;
+
+    const int iiw = ix * s0 + kx * d0 - p0;
+    const int iih = blockIdx.y * s1 + ky * d1 - p1;
 
     const int offset_dst =
-        (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
-        (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+        (blockIdx.y * OW + ix) * CHW +
+        (blockIdx.z * (KW * KH) + ky * KW + kx);
 
     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
         dst[offset_dst] = __float2half(0.0f);
     } else {
-        const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+        const int offset_src = blockIdx.z * offset_delta;
         dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
     }
 }
 
 template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+        const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, nrows, 1);
-    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
+        src0_dd, src1_dd, dst_dd,
+        ne00, /*ne01, ne02, ne03,*/
+        /*ne10, ne11,*/ ne12, /*ne13,*/
+        /* s0,*/ s1, s2, s3,
+        /* nb00,*/ nb01, nb02, nb03,
+        s10, s11, s12/*, s13*/);
+
+    (void) dst;
+}
+
+template<typename src0_t>
+static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+        const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+        src0_dd, src1_dd, dst_dd,
+        ne00, /*ne01, ne02, ne03,*/
+        /*ne10, ne11,*/ ne12, /*ne13,*/
+        /* s0,*/ s1, s2, s3,
+        /* nb00,*/ nb01, nb02, nb03,
+        s10, s11, s12/*, s13*/);
+
+    (void) dst;
 }
 
 template<float (*bin_op)(const float, const float)>
@@ -5069,7 +5349,6 @@ struct bin_bcast_cuda {
 
         GGML_TENSOR_BINARY_OP_LOCALS
 
-
         int nr0 = ne10/ne0;
         int nr1 = ne11/ne1;
         int nr2 = ne12/ne2;
@@ -5117,26 +5396,28 @@ struct bin_bcast_cuda {
         int64_t ne12 = cne1[2];
         int64_t ne13 = cne1[3];
 
-        //size_t nb0 = cnb0[0];
+        size_t nb0 = cnb0[0];
         size_t nb1 = cnb0[1];
         size_t nb2 = cnb0[2];
         size_t nb3 = cnb0[3];
 
-        //size_t nb10 = cnb1[0];
+        size_t nb10 = cnb1[0];
         size_t nb11 = cnb1[1];
         size_t nb12 = cnb1[2];
         size_t nb13 = cnb1[3];
 
-        //size_t s0 = nb0 / sizeof(src1_t);
-        size_t s1 = nb1 / sizeof(src1_t);
-        size_t s2 = nb2 / sizeof(src1_t);
-        size_t s3 = nb3 / sizeof(src1_t);
+        size_t s0 = nb0 / sizeof(dst_t);
+        size_t s1 = nb1 / sizeof(dst_t);
+        size_t s2 = nb2 / sizeof(dst_t);
+        size_t s3 = nb3 / sizeof(dst_t);
 
-        //size_t s10 = nb10 / sizeof(src1_t);
+        size_t s10 = nb10 / sizeof(src1_t);
         size_t s11 = nb11 / sizeof(src1_t);
         size_t s12 = nb12 / sizeof(src1_t);
         size_t s13 = nb13 / sizeof(src1_t);
 
+        GGML_ASSERT(s0 == 1);
+        GGML_ASSERT(s10 == 1);
 
         const int block_size = 128;
 
@@ -5174,6 +5455,13 @@ struct bin_bcast_cuda {
     }
 };
 
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+}
+
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
     gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -5184,11 +5472,26 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
     relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
+}
+
 static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
     sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -5205,6 +5508,38 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     }
 }
 
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    }
+}
+
+static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+}
+
+static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
+    int ne0 = (ne00 * scale_factor);
+    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
+    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
+}
+
+static void pad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02,
+    const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
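All of these new launchers follow the same pattern: a fixed block of 256 threads (the CUDA_*_BLOCK_SIZE constants added earlier) and a grid sized by ceiling division over the element count, with extra grid dimensions for ne1/ne2 where the kernel indexes a 3D tensor. The rounding is simply:

```cpp
// Ceil-division used to size the 1D grids above (editorial illustration, not in the package).
static inline int ceil_div(int n, int block_size) {
    return (n + block_size - 1) / block_size;
}
// e.g. ceil_div(1000, 256) == 4 -> 4 blocks cover 1024 threads; threads past n return early.
```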
@@ -6167,13 +6502,14 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
 }
 
-static void im2col_f32_f16_cuda(const float * x, half * dst,
-    int OH, int IW, int IH, int OW, int IC,
-    int KH, int KW, int N, int ofs0, int ofs1,
-    int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
-    dim3 block_nums(IC, OH, OW);
-    dim3 block_dims(N, KH, KW);
-    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+static void im2col_f32_f16_cuda(const float* x, half* dst,
+    int IW, int IH, int OW, int OH, int KW, int KH, int IC,
+    int offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    dim3 block_nums(num_blocks, OH, IC);
+    im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
 }
 
 // buffer pool for cuda
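The im2col rewrite changes the parallelization: each thread now handles one element of an OW·KW·KH slab, with the grid's y and z dimensions covering OH and IC, whereas the old dim3(N, KH, KW) block layout put the batch and kernel extents into the thread block and was therefore bounded by CUDA's 1024-threads-per-block limit. Illustrative numbers for the new launch arithmetic (the shapes below are assumed, not taken from the diff):

```cpp
// Illustrative launch arithmetic for the new im2col path (editorial, assumed shapes).
// With OW = 64, KW = KH = 3, OH = 64, IC = 256 and CUDA_IM2COL_BLOCK_SIZE = 256:
//   parallel_elements = OW * KW * KH    = 64 * 3 * 3 = 576
//   num_blocks        = ceil(576 / 256) = 3
//   grid              = dim3(3, OH, IC) = dim3(3, 64, 256), block = 256 threads
static inline int im2col_num_blocks(int OW, int KW, int KH, int block_size) {
    return (OW * KW * KH + block_size - 1) / block_size;
}
```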
@@ -6447,36 +6783,34 @@ static void ggml_cuda_op_get_rows(
 
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(dst));
 
-    const int ncols = src0->ne[0];
-    const int nrows = ggml_nelements(src1);
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
 
     const int32_t * src1_i32 = (const int32_t *) src1_d;
 
     switch (src0->type) {
         case GGML_TYPE_F16:
-            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_F32:
-            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
            break;
        default:
            // TODO: k-quants
@@ -6522,6 +6856,25 @@ inline void ggml_cuda_op_add(
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
+inline void ggml_cuda_op_acc(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
+
+    (void) dst;
+}
+
 inline void ggml_cuda_op_mul(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
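`ggml_cuda_op_acc` reads the view strides and offset that GGML_OP_ACC stores in `dst->op_params` in bytes and divides by 4 to get float32 element counts before calling `acc_f32_cuda`. A small self-contained sketch of that conversion; the op_params values below are invented for illustration and the layout comment only restates what the reads above imply:

```cpp
#include <cstdio>

// Editorial sketch of the byte -> float32-element conversion done by ggml_cuda_op_acc.
// Layout implied by the code above: [0]=nb1, [1]=nb2, [2]=nb3 (unused), [3]=offset, all in bytes.
// The concrete numbers here are made up.
int main() {
    const int op_params[4] = { 4096, 262144, 0, 512 };

    const int nb1    = op_params[0] / 4; // 1024 elements per row of the view
    const int nb2    = op_params[1] / 4; // 65536 elements per plane of the view
    const int offset = op_params[3] / 4; // view starts 128 elements into dst

    std::printf("nb1=%d nb2=%d offset=%d (float32 elements)\n", nb1, nb2, offset);
    return 0;
}
```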
@@ -6564,6 +6917,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_gelu_quick(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_tanh(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_relu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6578,6 +6959,23 @@ inline void ggml_cuda_op_relu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_leaky_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_sqr(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6612,6 +7010,71 @@ inline void ggml_cuda_op_norm(
     (void) src1_dd;
 }
 
+
+inline void ggml_cuda_op_group_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_concat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
+    }
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_upscale(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    const int scale_factor = dst->op_params[0];
+
+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_pad(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_cuda(src0_dd, dst_dd,
+        src0->ne[0], src0->ne[1], src0->ne[2],
+        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7126,7 +7589,6 @@ inline void ggml_cuda_op_im2col(
 
     const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
 
-    const int64_t N = src1->ne[is_2D ? 3 : 2];
     const int64_t IC = src1->ne[is_2D ? 2 : 1];
     const int64_t IH = is_2D ? src1->ne[1] : 1;
     const int64_t IW = src1->ne[0];
@@ -7137,17 +7599,15 @@ inline void ggml_cuda_op_im2col(
     const int64_t OH = is_2D ? dst->ne[2] : 1;
     const int64_t OW = dst->ne[1];
 
-    const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
-    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
 
-    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
-        OH, IW, IH, OW, IC, KH, KW, N,
-        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
 
     (void) src0;
     (void) src0_dd;
 }
 
+
 inline void ggml_cuda_op_sum_rows(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7696,6 +8156,10 @@ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
 
+static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
+}
+
 static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
@@ -7712,10 +8176,22 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
+}
+
+static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
+}
+
 static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
 }
 
+static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
+}
+
 static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
 }
@@ -7724,6 +8200,22 @@ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
 
+static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
+}
+
+static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
+}
+
+static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
+}
+
+static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
+}
+
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
@@ -8234,36 +8726,69 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
 }
 #endif
 
-static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #if 0
-//#ifdef CUDA_USE_TENSOR_CORES
-//    const bool use_tensor_cores = true;
-//#else
-//    const bool use_tensor_cores = false;
-//#endif
-
     ggml_cuda_mul_mat_id_cublas(dst);
-
     // TODO: mmq/mmv support
-#else
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const int id = dst->op_params[0];
+#endif
 
-    int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
 
-    int32_t a_id;
-    CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+    const struct ggml_tensor * ids = src0;
+    const int32_t id = ((int32_t *) dst->op_params)[0];
+    const int32_t n_as = ((int32_t *) dst->op_params)[1];
 
-    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
-    const struct ggml_tensor * src0 = dst->src[a_id + 2];
+    std::vector<char> ids_host(ggml_nbytes(ids));
 
-    ggml_cuda_mul_mat(src0, src1, dst);
-#endif
+    if (ids->backend == GGML_BACKEND_GPU) {
+        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+    } else {
+        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
+    }
+
+    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+
+    ggml_tensor_extra_gpu src1_row_extra;
+    ggml_tensor_extra_gpu dst_row_extra;
+
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+
+    src1_row.ne[1] = 1;
+    dst_row.ne[1] = 1;
+
+    src1_row.nb[2] = src1_row.nb[1];
+    dst_row.nb[2] = dst_row.nb[1];
+
+    src1_row.nb[3] = src1_row.nb[1];
+    dst_row.nb[3] = dst_row.nb[1];
+
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;
+
+
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        //int32_t row_id;
+        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
+        src1_row.data = (char *) src1->data + i01*src1->nb[1];
 
-    (void) _src0;
-    (void) _src1;
+        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
+        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+
+        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+    }
 }
 
 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
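`ggml_cuda_mul_mat_id` no longer resolves a single expert for the whole batch. It copies the `ids` tensor to the host once, then for every row looks up that row's expert id, points one-row views of src1 and dst at the row, and runs an ordinary `ggml_cuda_mul_mat` against the selected expert's weights (`dst->src[row_id + 2]`). The toy CPU program below mimics that per-row routing with plain vectors; everything in it is invented for illustration and none of it is ggml API:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy CPU analogue of per-row expert routing (editorial illustration only).
int main() {
    const int n_rows = 4, n_as = 3, n_in = 2, n_out = 2;

    const std::vector<int32_t> ids = { 2, 0, 1, 0 };      // expert chosen for each row

    // n_as expert matrices (n_out x n_in); expert e is filled with the value e
    std::vector<std::vector<float>> experts(n_as, std::vector<float>(n_out*n_in));
    for (int e = 0; e < n_as; ++e) for (float & w : experts[e]) w = (float) e;

    std::vector<float> src1(n_rows*n_in, 1.0f);           // one input row per token
    std::vector<float> dst (n_rows*n_out, 0.0f);

    for (int i01 = 0; i01 < n_rows; ++i01) {
        const int32_t row_id = ids[i01];                  // per-row expert lookup
        const std::vector<float> & w = experts[row_id];
        for (int o = 0; o < n_out; ++o) {                 // ordinary mat-vec for this row only
            float acc = 0.0f;
            for (int k = 0; k < n_in; ++k) acc += w[o*n_in + k] * src1[i01*n_in + k];
            dst[i01*n_out + o] = acc;
        }
    }

    for (int r = 0; r < n_rows; ++r) {
        std::printf("row %d -> expert %d -> out[0] = %.1f\n", r, ids[r], dst[r*n_out]);
    }
    return 0;
}
```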
@@ -8373,6 +8898,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg
     (void) dst;
 }
 
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     const int64_t nrows = ggml_nrows(tensor);
 
@@ -8422,8 +8953,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
         // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
         if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
 
         char * buf;
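Both padding computations in this diff now go through `ggml_row_size`, and the new `ggml_nbytes_split` helper expresses a split's size as whole rows of the tensor's type. `ggml_row_size(type, ne)` is the byte size of `ne` elements of a possibly block-quantized type, i.e. `ne / block_size * type_size` when the block size divides `ne`. A hedged sketch of the equivalent arithmetic, with made-up wrapper names:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Editorial sketch of the row-size arithmetic that ggml_row_size wraps (not ggml code).
static size_t row_size_bytes(size_t type_size, int64_t blck_size, int64_t ne) {
    assert(ne % blck_size == 0);             // the real function requires divisibility too
    return (size_t)(ne / blck_size) * type_size;
}

int main() {
    // Q4_0: 32-element blocks of 18 bytes (2-byte scale + 16 bytes of 4-bit weights)
    std::printf("4096 Q4_0 elements: %zu bytes per row\n", row_size_bytes(18, 32, 4096));
    // F32: block size 1, 4 bytes per element
    std::printf("4096 F32  elements: %zu bytes per row\n", row_size_bytes(4, 1, 4096));
    return 0;
}
```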
@@ -8683,6 +9213,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ADD:
             func = ggml_cuda_add;
             break;
+        case GGML_OP_ACC:
+            func = ggml_cuda_acc;
+            break;
         case GGML_OP_MUL:
             func = ggml_cuda_mul;
             break;
@@ -8697,6 +9230,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                 case GGML_UNARY_OP_SILU:
                     func = ggml_cuda_silu;
                     break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                    func = ggml_cuda_gelu_quick;
+                    break;
+                case GGML_UNARY_OP_TANH:
+                    func = ggml_cuda_tanh;
+                    break;
                 case GGML_UNARY_OP_RELU:
                     func = ggml_cuda_relu;
                     break;
@@ -8707,6 +9246,21 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_NORM:
             func = ggml_cuda_norm;
             break;
+        case GGML_OP_GROUP_NORM:
+            func = ggml_cuda_group_norm;
+            break;
+        case GGML_OP_CONCAT:
+            func = ggml_cuda_concat;
+            break;
+        case GGML_OP_UPSCALE:
+            func = ggml_cuda_upscale;
+            break;
+        case GGML_OP_PAD:
+            func = ggml_cuda_pad;
+            break;
+        case GGML_OP_LEAKY_RELU:
+            func = ggml_cuda_leaky_relu;
+            break;
         case GGML_OP_RMS_NORM:
             func = ggml_cuda_rms_norm;
             break;
@@ -8729,9 +9283,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_sqr;
             break;
         case GGML_OP_CLAMP:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_clamp;
             break;
         case GGML_OP_CPY:
@@ -8740,6 +9291,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_CONT:
             func = ggml_cuda_dup;
             break;
+        case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -8938,8 +9490,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
     }
 
@@ -9159,6 +9710,8 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
                     return true;
                 default:
                     return false;
@@ -9181,6 +9734,45 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
                 }
                 return true;
             } break;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                return false;
+            } break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -9188,7 +9780,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
         case GGML_OP_REPEAT:
-        case GGML_OP_GET_ROWS:
         case GGML_OP_DUP:
         case GGML_OP_ADD:
         case GGML_OP_MUL:
@@ -9197,7 +9788,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
-        case GGML_OP_CPY:
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
@@ -9206,6 +9796,12 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_IM2COL:
         case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_CONCAT:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_LEAKY_RELU:
            return true;
        default:
            return false;
@@ -9264,7 +9860,9 @@ static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * use
     UNUSED(params);
 }
 
-extern "C" int ggml_backend_cuda_reg_devices() {
+extern "C" int ggml_backend_cuda_reg_devices();
+
+int ggml_backend_cuda_reg_devices() {
     int device_count = ggml_cuda_get_device_count();
     //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
     for (int i = 0; i < device_count; i++) {