llama_cpp 0.9.5 → 0.10.1: diff of the bundled ggml-cuda.cu

@@ -1,12 +1,15 @@
  #include <algorithm>
+ #include <assert.h>
+ #include <atomic>
  #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
+ #include <float.h>
  #include <limits>
  #include <stdint.h>
  #include <stdio.h>
- #include <atomic>
- #include <assert.h>
+ #include <vector>
+
 
  #if defined(GGML_USE_HIPBLAS)
  #include <hip/hip_runtime.h>
@@ -69,6 +72,7 @@
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
  #define cudaSetDevice hipSetDevice
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamFireAndForget hipStreamFireAndForget
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
@@ -190,7 +194,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
  cudaGetErrorString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"CUDA error"); \
  } \
  } while (0)
 
@@ -204,7 +208,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #else
@@ -216,7 +220,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #endif // CUDART_VERSION >= 11
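The three hunks above change error handling in the CUDA_CHECK / CUBLAS_CHECK macros: a failed call no longer terminates the process with exit(1) but trips GGML_ASSERT(!"CUDA error"), aborting through ggml's assert path after printing the error and the current device. A minimal standalone sketch of the same pattern, assuming a plain assert() in place of GGML_ASSERT (MY_CUDA_CHECK and the message are illustrative, not the exact ggml macro):

    #include <assert.h>
    #include <stdio.h>
    #include <cuda_runtime.h>

    #define MY_CUDA_CHECK(call)                                              \
        do {                                                                 \
            cudaError_t err_ = (call);                                       \
            if (err_ != cudaSuccess) {                                       \
                int id = -1;                                                 \
                cudaGetDevice(&id);                                          \
                fprintf(stderr, "CUDA error %d at %s:%d: %s (device %d)\n",  \
                        (int) err_, __FILE__, __LINE__,                      \
                        cudaGetErrorString(err_), id);                       \
                assert(!"CUDA error"); /* abort instead of exit(1); note plain assert is compiled out under NDEBUG, GGML_ASSERT is not */ \
            }                                                                \
        } while (0)

    int main() {
        float * p = nullptr;
        MY_CUDA_CHECK(cudaMalloc(&p, 1024));
        MY_CUDA_CHECK(cudaFree(p));
        return 0;
    }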
@@ -433,10 +437,9 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define WARP_SIZE 32
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
- #define CUDA_ADD_BLOCK_SIZE 256
- #define CUDA_MUL_BLOCK_SIZE 256
  #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
+ #define CUDA_TANH_BLOCK_SIZE 256
  #define CUDA_RELU_BLOCK_SIZE 256
  #define CUDA_SQR_BLOCK_SIZE 256
  #define CUDA_CPY_BLOCK_SIZE 32
@@ -449,6 +452,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
  #define CUDA_GET_ROWS_BLOCK_SIZE 256
+ #define CUDA_UPSCALE_BLOCK_SIZE 256
+ #define CUDA_CONCAT_BLOCK_SIZE 256
+ #define CUDA_PAD_BLOCK_SIZE 256
+ #define CUDA_ACC_BLOCK_SIZE 256
+ #define CUDA_IM2COL_BLOCK_SIZE 256
 
  // dmmv = dequantize_mul_mat_vec
  #ifndef GGML_CUDA_DMMV_X
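The new CUDA_*_BLOCK_SIZE constants (UPSCALE, CONCAT, PAD, ACC, IM2COL, plus TANH above) follow the same launch convention as the existing ones: the host wrapper computes the grid size as a ceiling division of the element count by the block size, so every element gets one thread and the kernel guards against the overshoot. A minimal sketch of that pattern, with an illustrative kernel and names not taken from the diff:

    #include <cuda_runtime.h>

    #define BLOCK_SIZE 256 // same role as CUDA_PAD_BLOCK_SIZE etc.

    __global__ void fill_f32(float * dst, const float value, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= k) {
            return; // threads past the end of the tensor do nothing
        }
        dst[i] = value;
    }

    static void fill_f32_cuda(float * dst, float value, int k, cudaStream_t stream) {
        // ceiling division: enough blocks to cover k elements
        const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
        fill_f32<<<num_blocks, BLOCK_SIZE, 0, stream>>>(dst, value, k);
    }

    int main() {
        float * dst;
        cudaMalloc(&dst, 1000*sizeof(float));
        fill_f32_cuda(dst, 3.14f, 1000, 0); // 1000 elements -> ceil(1000/256) = 4 blocks
        cudaDeviceSynchronize();
        cudaFree(dst);
        return 0;
    }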
@@ -527,40 +535,105 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
527
535
  return x;
528
536
  }
529
537
 
530
- static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
531
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
538
+ static __device__ __forceinline__ float op_repeat(const float a, const float b) {
539
+ return b;
540
+ }
532
541
 
533
- if (i >= kx) {
534
- return;
535
- }
536
- dst[i] = x[i] + y[i%ky];
542
+ static __device__ __forceinline__ float op_add(const float a, const float b) {
543
+ return a + b;
537
544
  }
538
545
 
539
- static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
540
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
546
+ static __device__ __forceinline__ float op_mul(const float a, const float b) {
547
+ return a * b;
548
+ }
541
549
 
542
- if (i >= k) {
550
+ static __device__ __forceinline__ float op_div(const float a, const float b) {
551
+ return a / b;
552
+ }
553
+
554
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
555
+ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
556
+ int ne0, int ne1, int ne2, int ne3,
557
+ int ne10, int ne11, int ne12, int ne13,
558
+ /*int s0, */ int s1, int s2, int s3,
559
+ /*int s10,*/ int s11, int s12, int s13) {
560
+ const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
561
+ const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
562
+ const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
563
+ const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
564
+
565
+ if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
543
566
  return;
544
567
  }
545
- dst[i] = __hadd(x[i], __float2half(y[i]));
568
+
569
+ const int i11 = i1 % ne11;
570
+ const int i12 = i2 % ne12;
571
+ const int i13 = i3 % ne13;
572
+
573
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
574
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
575
+ const size_t i_dst = i_src0;
576
+
577
+ const src0_t * src0_row = src0 + i_src0;
578
+ const src1_t * src1_row = src1 + i_src1;
579
+ dst_t * dst_row = dst + i_dst;
580
+
581
+ for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
582
+ const int i10 = i0 % ne10;
583
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
584
+ }
546
585
  }
547
586
 
548
- static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
587
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
588
+ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
589
+ int ne0, int ne1, int ne2, int ne3,
590
+ int ne10, int ne11, int ne12, int ne13,
591
+ /*int s0, */ int s1, int s2, int s3,
592
+ /*int s10,*/ int s11, int s12, int s13) {
593
+
549
594
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
550
595
 
551
- if (i >= k) {
596
+ const int i3 = i/(ne2*ne1*ne0);
597
+ const int i2 = (i/(ne1*ne0)) % ne2;
598
+ const int i1 = (i/ne0) % ne1;
599
+ const int i0 = i % ne0;
600
+
601
+ if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
552
602
  return;
553
603
  }
554
- dst[i] = __half2float(x[i]) + y[i];
555
- }
556
604
 
557
- static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
558
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
605
+ const int i11 = i1 % ne11;
606
+ const int i12 = i2 % ne12;
607
+ const int i13 = i3 % ne13;
608
+
609
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
610
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
611
+ const size_t i_dst = i_src0;
612
+
613
+ const src0_t * src0_row = src0 + i_src0;
614
+ const src1_t * src1_row = src1 + i_src1;
615
+ dst_t * dst_row = dst + i_dst;
616
+
617
+ const int i10 = i0 % ne10;
618
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
619
+ }
559
620
 
560
- if (i >= kx) {
621
+ static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
622
+ const int ne10, const int ne11, const int ne12,
623
+ const int nb1, const int nb2, int offset) {
624
+ const int i = blockDim.x * blockIdx.x + threadIdx.x;
625
+ if (i >= ne) {
561
626
  return;
562
627
  }
563
- dst[i] = x[i] * y[i%ky];
628
+ int src1_idx = i - offset;
629
+ int oz = src1_idx / nb2;
630
+ int oy = (src1_idx - (oz * nb2)) / nb1;
631
+ int ox = src1_idx % nb1;
632
+ if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
633
+ dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
634
+ } else {
635
+ dst[i] = x[i];
636
+ }
564
637
  }
565
638
 
566
639
  static __global__ void gelu_f32(const float * x, float * dst, const int k) {
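The hunk above removes the fixed-shape add_f32 / add_f16_f32_* / mul_f32 kernels and replaces them with k_bin_bcast, a kernel templated on a device function (op_repeat, op_add, op_mul, op_div) that applies ggml-style broadcasting: each output element (i0, i1, i2, i3) reads src1 at (i0 % ne10, i1 % ne11, i2 % ne12, i3 % ne13). A cut-down sketch of the same idea for 2-D tensors, under the assumption of contiguous row-major storage (bcast_2d and my_op_add are illustrative names):

    #include <cstdio>
    #include <cuda_runtime.h>

    __device__ __forceinline__ float my_op_add(float a, float b) { return a + b; }

    // dst[ne0 x ne1] = bin_op(src0[ne0 x ne1], src1[ne10 x ne11]) with src1 broadcast
    template<float (*bin_op)(float, float)>
    __global__ void bcast_2d(const float * src0, const float * src1, float * dst,
                             int ne0, int ne1, int ne10, int ne11) {
        const int i0 = blockDim.x*blockIdx.x + threadIdx.x;
        const int i1 = blockDim.y*blockIdx.y + threadIdx.y;
        if (i0 >= ne0 || i1 >= ne1) {
            return;
        }
        const int i10 = i0 % ne10; // broadcast along dim 0
        const int i11 = i1 % ne11; // broadcast along dim 1
        dst[i1*ne0 + i0] = bin_op(src0[i1*ne0 + i0], src1[i11*ne10 + i10]);
    }

    int main() {
        const int ne0 = 8, ne1 = 4;   // dst/src0 shape
        const int ne10 = 8, ne11 = 1; // src1 is a single row, broadcast over all rows
        float *a, *b, *d;
        cudaMallocManaged(&a, ne0*ne1*sizeof(float));
        cudaMallocManaged(&b, ne10*ne11*sizeof(float));
        cudaMallocManaged(&d, ne0*ne1*sizeof(float));
        for (int i = 0; i < ne0*ne1; ++i) a[i] = 1.0f;
        for (int i = 0; i < ne10;    ++i) b[i] = (float) i;

        dim3 block(32, 4);
        dim3 grid((ne0 + block.x - 1)/block.x, (ne1 + block.y - 1)/block.y);
        bcast_2d<my_op_add><<<grid, block>>>(a, b, d, ne0, ne1, ne10, ne11);
        cudaDeviceSynchronize();
        printf("d[1*ne0+3] = %f\n", d[1*ne0 + 3]); // 1 + 3 = 4
        return 0;
    }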
@@ -585,6 +658,23 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
585
658
  dst[i] = x[i] / (1.0f + expf(-x[i]));
586
659
  }
587
660
 
661
+ static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
662
+ const float GELU_QUICK_COEF = -1.702f;
663
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
664
+ if (i >= k) {
665
+ return;
666
+ }
667
+ dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
668
+ }
669
+
670
+ static __global__ void tanh_f32(const float *x, float *dst, int k) {
671
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
672
+ if (i >= k) {
673
+ return;
674
+ }
675
+ dst[i] = tanhf(x[i]);
676
+ }
677
+
588
678
  static __global__ void relu_f32(const float * x, float * dst, const int k) {
589
679
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
590
680
 
@@ -594,6 +684,14 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
594
684
  dst[i] = fmaxf(x[i], 0);
595
685
  }
596
686
 
687
+ static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
688
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
689
+ if (i >= k) {
690
+ return;
691
+ }
692
+ dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
693
+ }
694
+
597
695
  static __global__ void sqr_f32(const float * x, float * dst, const int k) {
598
696
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
599
697
 
@@ -604,12 +702,10 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
604
702
  }
605
703
 
606
704
  template <int block_size>
607
- static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
705
+ static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
608
706
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
609
707
  const int tid = threadIdx.x;
610
708
 
611
- const float eps = 1e-5f;
612
-
613
709
  float2 mean_var = make_float2(0.f, 0.f);
614
710
 
615
711
  for (int col = tid; col < ncols; col += block_size) {
@@ -641,6 +737,132 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
641
737
  }
642
738
  }
643
739
 
740
+ static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
741
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
742
+ if (nidx >= ne0) {
743
+ return;
744
+ }
745
+ // operation
746
+ int offset_dst =
747
+ nidx +
748
+ blockIdx.y * ne0 +
749
+ blockIdx.z * ne0 * gridDim.y;
750
+ if (blockIdx.z < ne02) { // src0
751
+ int offset_src =
752
+ nidx +
753
+ blockIdx.y * ne0 +
754
+ blockIdx.z * ne0 * gridDim.y;
755
+ dst[offset_dst] = x[offset_src];
756
+ } else {
757
+ int offset_src =
758
+ nidx +
759
+ blockIdx.y * ne0 +
760
+ (blockIdx.z - ne02) * ne0 * gridDim.y;
761
+ dst[offset_dst] = y[offset_src];
762
+ }
763
+ }
764
+
765
+ static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
766
+ int ne0 = ne00 * scale_factor;
767
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
768
+ if (nidx >= ne0) {
769
+ return;
770
+ }
771
+ // operation
772
+ int i00 = nidx / scale_factor;
773
+ int i01 = blockIdx.y / scale_factor;
774
+ int offset_src =
775
+ i00 +
776
+ i01 * ne00 +
777
+ blockIdx.z * nb02;
778
+ int offset_dst =
779
+ nidx +
780
+ blockIdx.y * ne0 +
781
+ blockIdx.z * ne0 * gridDim.y;
782
+ dst[offset_dst] = x[offset_src];
783
+ }
784
+
785
+ static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
786
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
787
+ if (nidx >= ne0) {
788
+ return;
789
+ }
790
+
791
+ // operation
792
+ int offset_dst =
793
+ nidx +
794
+ blockIdx.y * ne0 +
795
+ blockIdx.z * ne0 * gridDim.y;
796
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
797
+ int offset_src =
798
+ nidx +
799
+ blockIdx.y * ne00 +
800
+ blockIdx.z * ne00 * ne01;
801
+ dst[offset_dst] = x[offset_src];
802
+ } else {
803
+ dst[offset_dst] = 0.0f;
804
+ }
805
+ }
806
+
807
+ template <int block_size>
808
+ static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
809
+ int start = blockIdx.x * group_size;
810
+ int end = start + group_size;
811
+
812
+ start += threadIdx.x;
813
+
814
+ if (end >= ne_elements) {
815
+ end = ne_elements;
816
+ }
817
+
818
+ float tmp = 0.0f; // partial sum for thread in warp
819
+
820
+ for (int j = start; j < end; j += block_size) {
821
+ tmp += x[j];
822
+ }
823
+
824
+ tmp = warp_reduce_sum(tmp);
825
+ if (block_size > WARP_SIZE) {
826
+ __shared__ float s_sum[32];
827
+ int warp_id = threadIdx.x / WARP_SIZE;
828
+ int lane_id = threadIdx.x % WARP_SIZE;
829
+ if (lane_id == 0) {
830
+ s_sum[warp_id] = tmp;
831
+ }
832
+ __syncthreads();
833
+ tmp = s_sum[lane_id];
834
+ tmp = warp_reduce_sum(tmp);
835
+ }
836
+
837
+ float mean = tmp / group_size;
838
+ tmp = 0.0f;
839
+
840
+ for (int j = start; j < end; j += block_size) {
841
+ float xi = x[j] - mean;
842
+ dst[j] = xi;
843
+ tmp += xi * xi;
844
+ }
845
+
846
+ tmp = warp_reduce_sum(tmp);
847
+ if (block_size > WARP_SIZE) {
848
+ __shared__ float s_sum[32];
849
+ int warp_id = threadIdx.x / WARP_SIZE;
850
+ int lane_id = threadIdx.x % WARP_SIZE;
851
+ if (lane_id == 0) {
852
+ s_sum[warp_id] = tmp;
853
+ }
854
+ __syncthreads();
855
+ tmp = s_sum[lane_id];
856
+ tmp = warp_reduce_sum(tmp);
857
+ }
858
+
859
+ float variance = tmp / group_size;
860
+ float scale = rsqrtf(variance + eps);
861
+ for (int j = start; j < end; j += block_size) {
862
+ dst[j] *= scale;
863
+ }
864
+ }
865
+
644
866
  template <int block_size>
645
867
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
646
868
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
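The new group_norm_f32 above computes the mean and then the variance of each group in two passes, each finishing with warp_reduce_sum and, for blocks wider than one warp, a second reduction through shared memory. That warp-level building block is typically a butterfly reduction over shuffle instructions; a self-contained sketch of the primitive is below (my_warp_reduce_sum is an illustrative stand-in, the diff itself relies on ggml's existing warp_reduce_sum):

    #include <cstdio>
    #include <cuda_runtime.h>

    #define WARP_SIZE 32

    // butterfly reduction: after log2(32) = 5 shuffle steps every lane holds the warp sum
    static __device__ __forceinline__ float my_warp_reduce_sum(float x) {
    #pragma unroll
        for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, WARP_SIZE);
        }
        return x;
    }

    __global__ void sum32(const float * x, float * out) {
        float v = x[threadIdx.x];
        v = my_warp_reduce_sum(v);
        if (threadIdx.x == 0) {
            *out = v;
        }
    }

    int main() {
        float *x, *out;
        cudaMallocManaged(&x, 32*sizeof(float));
        cudaMallocManaged(&out, sizeof(float));
        for (int i = 0; i < 32; ++i) x[i] = 1.0f;
        sum32<<<1, 32>>>(x, out);
        cudaDeviceSynchronize();
        printf("sum = %f\n", *out); // 32
        return 0;
    }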
@@ -1639,31 +1861,65 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1639
1861
  }
1640
1862
 
1641
1863
  template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1642
- static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
1643
- const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1644
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
1645
-
1646
- if (col >= ncols) {
1864
+ static __global__ void k_get_rows(
1865
+ const void * src0, const int32_t * src1, dst_t * dst,
1866
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
1867
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
1868
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
1869
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
1870
+ size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
1871
+
1872
+ const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1873
+ const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
1874
+ const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
1875
+ const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
1876
+
1877
+ if (i00 >= ne00) {
1647
1878
  return;
1648
1879
  }
1649
1880
 
1650
- const int r = y[row];
1881
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
1651
1882
 
1652
- // copy x[r*ncols + col] to dst[row*ncols + col]
1653
- const int xi = r*ncols + col;
1654
- const int di = row*ncols + col;
1883
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
1884
+ const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
1655
1885
 
1656
- const int ib = xi/qk; // block index
1657
- const int iqs = (xi%qk)/qr; // quant index
1658
- const int iybs = di - di%qk; // y block start index
1886
+ const int ib = i00/qk; // block index
1887
+ const int iqs = (i00%qk)/qr; // quant index
1888
+ const int iybs = i00 - i00%qk; // dst block start index
1659
1889
  const int y_offset = qr == 1 ? 1 : qk/2;
1660
1890
 
1661
1891
  // dequantize
1662
1892
  dfloat2 v;
1663
- dequantize_kernel(x, ib, iqs, v);
1893
+ dequantize_kernel(src0_row, ib, iqs, v);
1894
+
1895
+ dst_row[iybs + iqs + 0] = v.x;
1896
+ dst_row[iybs + iqs + y_offset] = v.y;
1897
+ }
1898
+
1899
+ template<typename src0_t, typename dst_t>
1900
+ static __global__ void k_get_rows_float(
1901
+ const src0_t * src0, const int32_t * src1, dst_t * dst,
1902
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
1903
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
1904
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
1905
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
1906
+ size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
1907
+
1908
+ const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
1909
+ const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
1910
+ const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
1911
+ const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
1664
1912
 
1665
- dst[iybs + iqs + 0] = v.x;
1666
- dst[iybs + iqs + y_offset] = v.y;
1913
+ if (i00 >= ne00) {
1914
+ return;
1915
+ }
1916
+
1917
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
1918
+
1919
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
1920
+ const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
1921
+
1922
+ dst_row[i00] = src0_row[i00];
1667
1923
  }
1668
1924
 
1669
1925
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
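The rewritten k_get_rows / k_get_rows_float kernels above index src1 as a 3-D tensor of row ids and address src0 through byte strides (nb01/nb02/nb03), so GET_ROWS now handles batched and non-contiguous inputs rather than assuming one flat ncols x nrows matrix. The operation itself is a row gather; a minimal host-side reference for the contiguous 2-D f32 case, useful as a mental model or a test oracle (get_rows_ref is an illustrative name):

    #include <cstdint>
    #include <vector>

    // dst[r] = src0[rows[r]] for each requested row id; src0 is ne01 rows of ne00 floats
    static std::vector<float> get_rows_ref(const std::vector<float> & src0,
                                           const std::vector<int32_t> & rows,
                                           int64_t ne00) {
        std::vector<float> dst(rows.size() * ne00);
        for (size_t r = 0; r < rows.size(); ++r) {
            const int32_t i01 = rows[r]; // row index into src0
            for (int64_t i00 = 0; i00 < ne00; ++i00) {
                dst[r*ne00 + i00] = src0[i01*ne00 + i00];
            }
        }
        return dst;
    }

In the quantized k_get_rows path the element index advances two at a time (one dequantize call yields v.x and v.y), which is why the launcher later asserts ne00 % 2 == 0.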
@@ -4559,6 +4815,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
4559
4815
  cpy_1(cx + x_offset, cdst + dst_offset);
4560
4816
  }
4561
4817
 
4818
+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
4819
+ const float * xi = (const float *) cxi;
4820
+ block_q8_0 * dsti = (block_q8_0 *) cdsti;
4821
+
4822
+ float amax = 0.0f; // absolute max
4823
+
4824
+ for (int j = 0; j < QK8_0; j++) {
4825
+ const float v = xi[j];
4826
+ amax = fmaxf(amax, fabsf(v));
4827
+ }
4828
+
4829
+ const float d = amax / ((1 << 7) - 1);
4830
+ const float id = d ? 1.0f/d : 0.0f;
4831
+
4832
+ dsti->d = d;
4833
+
4834
+ for (int j = 0; j < QK8_0; ++j) {
4835
+ const float x0 = xi[j]*id;
4836
+
4837
+ dsti->qs[j] = roundf(x0);
4838
+ }
4839
+ }
4840
+
4841
+ static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
4842
+ const float * xi = (const float *) cxi;
4843
+ block_q4_0 * dsti = (block_q4_0 *) cdsti;
4844
+
4845
+ float amax = 0.0f;
4846
+ float vmax = 0.0f;
4847
+
4848
+ for (int j = 0; j < QK4_0; ++j) {
4849
+ const float v = xi[j];
4850
+ if (amax < fabsf(v)) {
4851
+ amax = fabsf(v);
4852
+ vmax = v;
4853
+ }
4854
+ }
4855
+
4856
+ const float d = vmax / -8;
4857
+ const float id = d ? 1.0f/d : 0.0f;
4858
+
4859
+ dsti->d = d;
4860
+
4861
+ for (int j = 0; j < QK4_0/2; ++j) {
4862
+ const float x0 = xi[0 + j]*id;
4863
+ const float x1 = xi[QK4_0/2 + j]*id;
4864
+
4865
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
4866
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
4867
+
4868
+ dsti->qs[j] = xi0;
4869
+ dsti->qs[j] |= xi1 << 4;
4870
+ }
4871
+ }
4872
+
4873
+ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
4874
+ const float * xi = (const float *) cxi;
4875
+ block_q4_1 * dsti = (block_q4_1 *) cdsti;
4876
+
4877
+ float vmin = FLT_MAX;
4878
+ float vmax = -FLT_MAX;
4879
+
4880
+ for (int j = 0; j < QK4_1; ++j) {
4881
+ const float v = xi[j];
4882
+
4883
+ if (v < vmin) vmin = v;
4884
+ if (v > vmax) vmax = v;
4885
+ }
4886
+
4887
+ const float d = (vmax - vmin) / ((1 << 4) - 1);
4888
+ const float id = d ? 1.0f/d : 0.0f;
4889
+
4890
+ dsti->dm.x = d;
4891
+ dsti->dm.y = vmin;
4892
+
4893
+ for (int j = 0; j < QK4_1/2; ++j) {
4894
+ const float x0 = (xi[0 + j] - vmin)*id;
4895
+ const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
4896
+
4897
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
4898
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
4899
+
4900
+ dsti->qs[j] = xi0;
4901
+ dsti->qs[j] |= xi1 << 4;
4902
+ }
4903
+ }
4904
+
4905
+ template <cpy_kernel_t cpy_blck, int qk>
4906
+ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
4907
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
4908
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
4909
+ const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
4910
+
4911
+ if (i >= ne) {
4912
+ return;
4913
+ }
4914
+
4915
+ const int i02 = i / (ne00*ne01);
4916
+ const int i01 = (i - i02*ne01*ne00) / ne00;
4917
+ const int i00 = (i - i02*ne01*ne00 - i01*ne00);
4918
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
4919
+
4920
+ const int i12 = i / (ne10*ne11);
4921
+ const int i11 = (i - i12*ne10*ne11) / ne10;
4922
+ const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
4923
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
4924
+
4925
+ cpy_blck(cx + x_offset, cdst + dst_offset);
4926
+ }
4927
+
4562
4928
  static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
4563
4929
  const float y = (i0 / 2 - low) / max(0.001f, high - low);
4564
4930
  return 1.0f - min(1.0f, max(0.0f, y));
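The new cpy_blck_f32_q8_0 / q4_0 / q4_1 device functions and the cpy_f32_q wrapper above let the copy op write f32 data straight into quantized blocks. For Q8_0 the math is: per block of QK8_0 values, d = amax / 127 and each value is stored as round(x / d) in an int8. A host-side sketch of that round trip under the assumption of a block_q8_0-like struct (my_block_q8_0 is illustrative):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    #define QK8_0 32

    struct my_block_q8_0 {
        float  d;         // scale (ggml's block_q8_0 stores this as fp16)
        int8_t qs[QK8_0]; // quantized values
    };

    static void quantize_block_q8_0(const float * x, my_block_q8_0 * b) {
        float amax = 0.0f; // absolute max of the block
        for (int j = 0; j < QK8_0; ++j) {
            amax = fmaxf(amax, fabsf(x[j]));
        }
        const float d  = amax / 127.0f; // (1 << 7) - 1, as in the diff
        const float id = d ? 1.0f/d : 0.0f;
        b->d = d;
        for (int j = 0; j < QK8_0; ++j) {
            b->qs[j] = (int8_t) roundf(x[j]*id);
        }
    }

    int main() {
        float x[QK8_0];
        for (int j = 0; j < QK8_0; ++j) x[j] = 0.1f*j - 1.0f;
        my_block_q8_0 b;
        quantize_block_q8_0(x, &b);
        // dequantize: x' = d * q
        printf("x[5] = %f, reconstructed = %f\n", x[5], b.d * b.qs[5]);
        return 0;
    }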
@@ -4713,6 +5079,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
4713
5079
  dst[i] = col * m_k + x[i];
4714
5080
  }
4715
5081
 
5082
+ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
5083
+ const int row = blockIdx.y;
5084
+ const int col = threadIdx.x;
5085
+
5086
+ float sum = 0.0f;
5087
+ for (int i = col; i < ncols; i += blockDim.x) {
5088
+ sum += x[row * ncols + i];
5089
+ }
5090
+
5091
+ sum = warp_reduce_sum(sum);
5092
+
5093
+ if (col == 0) {
5094
+ dst[row] = sum;
5095
+ }
5096
+ }
5097
+
5098
+ template<typename T>
5099
+ static inline __device__ void swap(T & a, T & b) {
5100
+ T tmp = a;
5101
+ a = b;
5102
+ b = tmp;
5103
+ }
5104
+
5105
+ template<ggml_sort_order order>
5106
+ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
5107
+ // bitonic sort
5108
+ int col = threadIdx.x;
5109
+ int row = blockIdx.y;
5110
+
5111
+ if (col >= ncols) return;
5112
+
5113
+ const float * x_row = x + row * ncols;
5114
+ int * dst_row = dst + row * ncols;
5115
+
5116
+ // initialize indices
5117
+ if (col < ncols) {
5118
+ dst_row[col] = col;
5119
+ }
5120
+ __syncthreads();
5121
+
5122
+ for (int k = 2; k <= ncols; k *= 2) {
5123
+ for (int j = k / 2; j > 0; j /= 2) {
5124
+ int ixj = col ^ j;
5125
+ if (ixj > col) {
5126
+ if ((col & k) == 0) {
5127
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
5128
+ swap(dst_row[col], dst_row[ixj]);
5129
+ }
5130
+ } else {
5131
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
5132
+ swap(dst_row[col], dst_row[ixj]);
5133
+ }
5134
+ }
5135
+ }
5136
+ __syncthreads();
5137
+ }
5138
+ }
5139
+ }
5140
+
4716
5141
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
4717
5142
  const int col = blockDim.y*blockIdx.y + threadIdx.y;
4718
5143
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
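k_argsort_f32_i32 above sorts an index array with an in-block bitonic sort, which is why the launcher added later asserts that ncols is a power of two and runs one block of ncols threads per row. Because the compare-exchange pairs at each (k, j) step are disjoint, the same network can be run serially; a host-side reference of the ascending case (bitonic_argsort_ref is an illustrative name):

    #include <cstdio>
    #include <utility>

    // serial bitonic argsort of one row; n must be a power of two
    static void bitonic_argsort_ref(const float * x, int * idx, int n) {
        for (int i = 0; i < n; ++i) idx[i] = i;
        for (int k = 2; k <= n; k *= 2) {      // size of the bitonic subsequences
            for (int j = k/2; j > 0; j /= 2) { // compare-exchange distance
                for (int col = 0; col < n; ++col) {
                    const int ixj = col ^ j;
                    if (ixj > col) {
                        const bool up = (col & k) == 0; // direction of this subsequence
                        if (up ? x[idx[col]] > x[idx[ixj]] : x[idx[col]] < x[idx[ixj]]) {
                            std::swap(idx[col], idx[ixj]);
                        }
                    }
                }
            }
        }
    }

    int main() {
        const float x[8] = {3.f, 1.f, 4.f, 1.5f, 9.f, 2.f, 6.f, 5.f};
        int idx[8];
        bitonic_argsort_ref(x, idx, 8);
        for (int i = 0; i < 8; ++i) printf("%d ", idx[i]); // indices of x in ascending order
        printf("\n");
        return 0;
    }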
@@ -4722,8 +5147,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  }
 
  const int i = row*ncols + col;
- // dst[i] = col > n_past + row ? -INFINITY : x[i];
- dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+ //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
  }
 
  static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
@@ -4820,49 +5246,220 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
4820
5246
 
4821
5247
  static __global__ void im2col_f32_f16(
4822
5248
  const float * x, half * dst,
4823
- int ofs0, int ofs1, int IW, int IH, int CHW,
5249
+ int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
4824
5250
  int s0, int s1, int p0, int p1, int d0, int d1) {
4825
- const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
4826
- const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
5251
+ const int i = threadIdx.x + blockIdx.x * blockDim.x;
5252
+ if (i >= pelements) {
5253
+ return;
5254
+ }
5255
+
5256
+ const int ksize = OW * (KH > 1 ? KW : 1);
5257
+ const int kx = i / ksize;
5258
+ const int kd = kx * ksize;
5259
+ const int ky = (i - kd) / OW;
5260
+ const int ix = i % OW;
5261
+
5262
+ const int iiw = ix * s0 + kx * d0 - p0;
5263
+ const int iih = blockIdx.y * s1 + ky * d1 - p1;
4827
5264
 
4828
5265
  const int offset_dst =
4829
- (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
4830
- (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
5266
+ (blockIdx.y * OW + ix) * CHW +
5267
+ (blockIdx.z * (KW * KH) + ky * KW + kx);
4831
5268
 
4832
5269
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
4833
5270
  dst[offset_dst] = __float2half(0.0f);
4834
5271
  } else {
4835
- const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
5272
+ const int offset_src = blockIdx.z * offset_delta;
4836
5273
  dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
4837
5274
  }
4838
5275
  }
4839
5276
 
4840
5277
  template<int qk, int qr, dequantize_kernel_t dq>
4841
- static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
5278
+ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5279
+ const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
5280
+
5281
+ GGML_TENSOR_BINARY_OP_LOCALS
5282
+
4842
5283
  const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
4843
- const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
4844
- const dim3 block_nums(block_num_x, nrows, 1);
4845
- k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
4846
- }
5284
+ const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
5285
+ const dim3 block_nums(block_num_x, ne10, ne11*ne12);
5286
+
5287
+ // strides in elements
5288
+ //const size_t s0 = nb0 / ggml_element_size(dst);
5289
+ const size_t s1 = nb1 / ggml_element_size(dst);
5290
+ const size_t s2 = nb2 / ggml_element_size(dst);
5291
+ const size_t s3 = nb3 / ggml_element_size(dst);
5292
+
5293
+ const size_t s10 = nb10 / ggml_element_size(src1);
5294
+ const size_t s11 = nb11 / ggml_element_size(src1);
5295
+ const size_t s12 = nb12 / ggml_element_size(src1);
5296
+ //const size_t s13 = nb13 / ggml_element_size(src1);
5297
+
5298
+ GGML_ASSERT(ne00 % 2 == 0);
5299
+
5300
+ k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
5301
+ src0_dd, src1_dd, dst_dd,
5302
+ ne00, /*ne01, ne02, ne03,*/
5303
+ /*ne10, ne11,*/ ne12, /*ne13,*/
5304
+ /* s0,*/ s1, s2, s3,
5305
+ /* nb00,*/ nb01, nb02, nb03,
5306
+ s10, s11, s12/*, s13*/);
4847
5307
 
4848
- static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4849
- const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4850
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
5308
+ (void) dst;
4851
5309
  }
4852
5310
 
4853
- static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
4854
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4855
- add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
4856
- }
5311
+ template<typename src0_t>
5312
+ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5313
+ const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
5314
+
5315
+ GGML_TENSOR_BINARY_OP_LOCALS
4857
5316
 
4858
- static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
4859
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4860
- add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
5317
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
5318
+ const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
5319
+ const dim3 block_nums(block_num_x, ne10, ne11*ne12);
5320
+
5321
+ // strides in elements
5322
+ //const size_t s0 = nb0 / ggml_element_size(dst);
5323
+ const size_t s1 = nb1 / ggml_element_size(dst);
5324
+ const size_t s2 = nb2 / ggml_element_size(dst);
5325
+ const size_t s3 = nb3 / ggml_element_size(dst);
5326
+
5327
+ const size_t s10 = nb10 / ggml_element_size(src1);
5328
+ const size_t s11 = nb11 / ggml_element_size(src1);
5329
+ const size_t s12 = nb12 / ggml_element_size(src1);
5330
+ //const size_t s13 = nb13 / ggml_element_size(src1);
5331
+
5332
+ k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
5333
+ src0_dd, src1_dd, dst_dd,
5334
+ ne00, /*ne01, ne02, ne03,*/
5335
+ /*ne10, ne11,*/ ne12, /*ne13,*/
5336
+ /* s0,*/ s1, s2, s3,
5337
+ /* nb00,*/ nb01, nb02, nb03,
5338
+ s10, s11, s12/*, s13*/);
5339
+
5340
+ (void) dst;
4861
5341
  }
4862
5342
 
4863
- static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4864
- const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
4865
- mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
5343
+ template<float (*bin_op)(const float, const float)>
5344
+ struct bin_bcast_cuda {
5345
+ template<typename src0_t, typename src1_t, typename dst_t>
5346
+ void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
5347
+ const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
5348
+ cudaStream_t stream) {
5349
+
5350
+ GGML_TENSOR_BINARY_OP_LOCALS
5351
+
5352
+ int nr0 = ne10/ne0;
5353
+ int nr1 = ne11/ne1;
5354
+ int nr2 = ne12/ne2;
5355
+ int nr3 = ne13/ne3;
5356
+
5357
+ int nr[4] = { nr0, nr1, nr2, nr3 };
5358
+
5359
+ // collapse dimensions until first broadcast dimension
5360
+ int64_t cne0[] = {ne0, ne1, ne2, ne3};
5361
+ int64_t cne1[] = {ne10, ne11, ne12, ne13};
5362
+ size_t cnb0[] = {nb0, nb1, nb2, nb3};
5363
+ size_t cnb1[] = {nb10, nb11, nb12, nb13};
5364
+ auto collapse = [](int64_t cne[]) {
5365
+ cne[0] *= cne[1];
5366
+ cne[1] = cne[2];
5367
+ cne[2] = cne[3];
5368
+ cne[3] = 1;
5369
+ };
5370
+
5371
+ auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
5372
+ cnb[1] *= cne[1];
5373
+ cnb[2] *= cne[2];
5374
+ cnb[3] *= cne[3];
5375
+ };
5376
+
5377
+ for (int i = 0; i < 4; i++) {
5378
+ if (nr[i] != 1) {
5379
+ break;
5380
+ }
5381
+ if (i > 0) {
5382
+ collapse_nb(cnb0, cne0);
5383
+ collapse_nb(cnb1, cne1);
5384
+ collapse(cne0);
5385
+ collapse(cne1);
5386
+ }
5387
+ }
5388
+ {
5389
+ int64_t ne0 = cne0[0];
5390
+ int64_t ne1 = cne0[1];
5391
+ int64_t ne2 = cne0[2];
5392
+ int64_t ne3 = cne0[3];
5393
+
5394
+ int64_t ne10 = cne1[0];
5395
+ int64_t ne11 = cne1[1];
5396
+ int64_t ne12 = cne1[2];
5397
+ int64_t ne13 = cne1[3];
5398
+
5399
+ size_t nb0 = cnb0[0];
5400
+ size_t nb1 = cnb0[1];
5401
+ size_t nb2 = cnb0[2];
5402
+ size_t nb3 = cnb0[3];
5403
+
5404
+ size_t nb10 = cnb1[0];
5405
+ size_t nb11 = cnb1[1];
5406
+ size_t nb12 = cnb1[2];
5407
+ size_t nb13 = cnb1[3];
5408
+
5409
+ size_t s0 = nb0 / sizeof(dst_t);
5410
+ size_t s1 = nb1 / sizeof(dst_t);
5411
+ size_t s2 = nb2 / sizeof(dst_t);
5412
+ size_t s3 = nb3 / sizeof(dst_t);
5413
+
5414
+ size_t s10 = nb10 / sizeof(src1_t);
5415
+ size_t s11 = nb11 / sizeof(src1_t);
5416
+ size_t s12 = nb12 / sizeof(src1_t);
5417
+ size_t s13 = nb13 / sizeof(src1_t);
5418
+
5419
+ GGML_ASSERT(s0 == 1);
5420
+ GGML_ASSERT(s10 == 1);
5421
+
5422
+ const int block_size = 128;
5423
+
5424
+ int64_t hne0 = std::max(ne0/2LL, 1LL);
5425
+
5426
+ dim3 block_dims;
5427
+ block_dims.x = std::min<unsigned int>(hne0, block_size);
5428
+ block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
5429
+ block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
5430
+
5431
+ dim3 block_nums(
5432
+ (hne0 + block_dims.x - 1) / block_dims.x,
5433
+ (ne1 + block_dims.y - 1) / block_dims.y,
5434
+ (ne2*ne3 + block_dims.z - 1) / block_dims.z
5435
+ );
5436
+
5437
+ if (block_nums.z > 65535) {
5438
+ // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
5439
+ int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
5440
+ k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
5441
+ src0_dd, src1_dd, dst_dd,
5442
+ ne0, ne1, ne2, ne3,
5443
+ ne10, ne11, ne12, ne13,
5444
+ /* s0, */ s1, s2, s3,
5445
+ /* s10, */ s11, s12, s13);
5446
+ } else {
5447
+ k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
5448
+ src0_dd, src1_dd, dst_dd,
5449
+ ne0, ne1, ne2, ne3,
5450
+ ne10, ne11, ne12, ne13,
5451
+ /* s0, */ s1, s2, s3,
5452
+ /* s10, */ s11, s12, s13);
5453
+ }
5454
+ }
5455
+ }
5456
+ };
5457
+
5458
+ static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
5459
+ const int ne10, const int ne11, const int ne12,
5460
+ const int nb1, const int nb2, const int offset, cudaStream_t stream) {
5461
+ int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
5462
+ acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
4866
5463
  }
4867
5464
 
4868
5465
  static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
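In the launcher above, bin_bcast_cuda first collapses leading non-broadcast dimensions, then picks between two kernels: the 3-D-grid k_bin_bcast, or, when ne2*ne3 would need more than 65535 blocks in the z dimension (the CUDA grid limit noted in the diff's comment), the 1-D k_bin_bcast_unravel, which recovers (i0, i1, i2, i3) from a flat thread index. A small sketch of that unravelling arithmetic in plain host code (unravel4 is an illustrative name):

    #include <cstdio>

    // flat index -> 4-D coordinates for a tensor of shape (ne0, ne1, ne2, ne3)
    static void unravel4(long long i, long long ne0, long long ne1, long long ne2,
                         long long * i0, long long * i1, long long * i2, long long * i3) {
        *i3 =  i / (ne2*ne1*ne0);
        *i2 = (i / (ne1*ne0)) % ne2;
        *i1 = (i / ne0)       % ne1;
        *i0 =  i              % ne0;
    }

    int main() {
        long long i0, i1, i2, i3;
        // element 5 of row 2 of matrix 1 of batch 0 in a (8, 4, 3, 2) tensor
        const long long flat = ((0*3 + 1)*4 + 2)*8 + 5;
        unravel4(flat, 8, 4, 3, &i0, &i1, &i2, &i3);
        printf("i0=%lld i1=%lld i2=%lld i3=%lld\n", i0, i1, i2, i3); // 5 2 1 0
        return 0;
    }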
@@ -4875,27 +5472,74 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
4875
5472
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
4876
5473
  }
4877
5474
 
5475
+ static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
5476
+ const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
5477
+ gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
5478
+ }
5479
+
5480
+ static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
5481
+ const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
5482
+ tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
5483
+ }
5484
+
4878
5485
  static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
4879
5486
  const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
4880
5487
  relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
4881
5488
  }
4882
5489
 
5490
+ static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
5491
+ const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
5492
+ leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
5493
+ }
5494
+
4883
5495
  static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
4884
5496
  const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
4885
5497
  sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
4886
5498
  }
4887
5499
 
4888
- static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5500
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
4889
5501
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4890
5502
  if (ncols < 1024) {
4891
5503
  const dim3 block_dims(WARP_SIZE, 1, 1);
4892
- norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
5504
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
5505
+ } else {
5506
+ const dim3 block_dims(1024, 1, 1);
5507
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
5508
+ }
5509
+ }
5510
+
5511
+ static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
5512
+ static const float eps = 1e-6f;
5513
+ if (group_size < 1024) {
5514
+ const dim3 block_dims(WARP_SIZE, 1, 1);
5515
+ group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
4893
5516
  } else {
4894
5517
  const dim3 block_dims(1024, 1, 1);
4895
- norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
5518
+ group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
4896
5519
  }
4897
5520
  }
4898
5521
 
5522
+ static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
5523
+ int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
5524
+ dim3 gridDim(num_blocks, ne1, ne2);
5525
+ concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
5526
+ }
5527
+
5528
+ static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
5529
+ int ne0 = (ne00 * scale_factor);
5530
+ int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
5531
+ dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
5532
+ upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
5533
+ }
5534
+
5535
+ static void pad_f32_cuda(const float * x, float * dst,
5536
+ const int ne00, const int ne01, const int ne02,
5537
+ const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
5538
+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
5539
+ dim3 gridDim(num_blocks, ne1, ne2);
5540
+ pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
5541
+ }
5542
+
4899
5543
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
4900
5544
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4901
5545
  if (ncols < 1024) {
@@ -4914,34 +5558,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
4914
5558
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
4915
5559
  }
4916
5560
 
4917
- template<typename dst_t>
4918
- static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4919
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4920
- dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4921
- }
4922
-
4923
- template<typename dst_t>
4924
- static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4925
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4926
- dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4927
- }
4928
-
4929
- template<typename dst_t>
4930
- static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4931
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4932
- dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4933
- }
4934
-
4935
- template<typename dst_t>
4936
- static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4937
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4938
- dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4939
- }
4940
-
4941
- template<typename dst_t>
4942
- static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
5561
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
5562
+ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
4943
5563
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4944
- dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
5564
+ dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4945
5565
  }
4946
5566
 
4947
5567
  template<typename dst_t>
@@ -4990,6 +5610,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
4990
5610
  #endif
4991
5611
  }
4992
5612
 
5613
+ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
5614
+ switch (type) {
5615
+ case GGML_TYPE_Q4_0:
5616
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
5617
+ case GGML_TYPE_Q4_1:
5618
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
5619
+ case GGML_TYPE_Q5_0:
5620
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
5621
+ case GGML_TYPE_Q5_1:
5622
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
5623
+ case GGML_TYPE_Q8_0:
5624
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
5625
+ case GGML_TYPE_Q2_K:
5626
+ return dequantize_row_q2_K_cuda;
5627
+ case GGML_TYPE_Q3_K:
5628
+ return dequantize_row_q3_K_cuda;
5629
+ case GGML_TYPE_Q4_K:
5630
+ return dequantize_row_q4_K_cuda;
5631
+ case GGML_TYPE_Q5_K:
5632
+ return dequantize_row_q5_K_cuda;
5633
+ case GGML_TYPE_Q6_K:
5634
+ return dequantize_row_q6_K_cuda;
5635
+ case GGML_TYPE_F32:
5636
+ return dequantize_block_cuda<1, 1, convert_f32>;
5637
+ default:
5638
+ return nullptr;
5639
+ }
5640
+ }
5641
+
5642
+ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
5643
+ switch (type) {
5644
+ case GGML_TYPE_Q4_0:
5645
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
5646
+ case GGML_TYPE_Q4_1:
5647
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
5648
+ case GGML_TYPE_Q5_0:
5649
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
5650
+ case GGML_TYPE_Q5_1:
5651
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
5652
+ case GGML_TYPE_Q8_0:
5653
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
5654
+ case GGML_TYPE_Q2_K:
5655
+ return dequantize_row_q2_K_cuda;
5656
+ case GGML_TYPE_Q3_K:
5657
+ return dequantize_row_q3_K_cuda;
5658
+ case GGML_TYPE_Q4_K:
5659
+ return dequantize_row_q4_K_cuda;
5660
+ case GGML_TYPE_Q5_K:
5661
+ return dequantize_row_q5_K_cuda;
5662
+ case GGML_TYPE_Q6_K:
5663
+ return dequantize_row_q6_K_cuda;
5664
+ case GGML_TYPE_F16:
5665
+ return dequantize_block_cuda<1, 1, convert_f16>;
5666
+ default:
5667
+ return nullptr;
5668
+ }
5669
+ }
5670
+
4993
5671
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4994
5672
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4995
5673
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5078,6 +5756,15 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
5078
5756
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
5079
5757
  }
5080
5758
 
5759
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5760
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
5761
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
5762
+ const dim3 block_nums(block_num_y, 1, 1);
5763
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
5764
+ dequantize_mul_mat_vec<1, 1, convert_f16>
5765
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
5766
+ }
5767
+
5081
5768
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5082
5769
  GGML_ASSERT(ncols % QK4_0 == 0);
5083
5770
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5168,83 +5855,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
5168
5855
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
5169
5856
  }
5170
5857
 
5171
- static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
5172
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
5173
- dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
5174
- }
5175
-
5176
- static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
5177
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
5178
- dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
5179
- }
5180
-
5181
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5182
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
5183
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
5184
- const dim3 block_nums(block_num_y, 1, 1);
5185
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
5186
- dequantize_mul_mat_vec<1, 1, convert_f16>
5187
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
5188
- }
5189
-
5190
- static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
5191
- switch (type) {
5192
- case GGML_TYPE_Q4_0:
5193
- return dequantize_row_q4_0_cuda;
5194
- case GGML_TYPE_Q4_1:
5195
- return dequantize_row_q4_1_cuda;
5196
- case GGML_TYPE_Q5_0:
5197
- return dequantize_row_q5_0_cuda;
5198
- case GGML_TYPE_Q5_1:
5199
- return dequantize_row_q5_1_cuda;
5200
- case GGML_TYPE_Q8_0:
5201
- return dequantize_row_q8_0_cuda;
5202
- case GGML_TYPE_Q2_K:
5203
- return dequantize_row_q2_K_cuda;
5204
- case GGML_TYPE_Q3_K:
5205
- return dequantize_row_q3_K_cuda;
5206
- case GGML_TYPE_Q4_K:
5207
- return dequantize_row_q4_K_cuda;
5208
- case GGML_TYPE_Q5_K:
5209
- return dequantize_row_q5_K_cuda;
5210
- case GGML_TYPE_Q6_K:
5211
- return dequantize_row_q6_K_cuda;
5212
- case GGML_TYPE_F32:
5213
- return convert_fp32_to_fp16_cuda;
5214
- default:
5215
- return nullptr;
5216
- }
5217
- }
5218
-
5219
- static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
5220
- switch (type) {
5221
- case GGML_TYPE_Q4_0:
5222
- return dequantize_row_q4_0_cuda;
5223
- case GGML_TYPE_Q4_1:
5224
- return dequantize_row_q4_1_cuda;
5225
- case GGML_TYPE_Q5_0:
5226
- return dequantize_row_q5_0_cuda;
5227
- case GGML_TYPE_Q5_1:
5228
- return dequantize_row_q5_1_cuda;
5229
- case GGML_TYPE_Q8_0:
5230
- return dequantize_row_q8_0_cuda;
5231
- case GGML_TYPE_Q2_K:
5232
- return dequantize_row_q2_K_cuda;
5233
- case GGML_TYPE_Q3_K:
5234
- return dequantize_row_q3_K_cuda;
5235
- case GGML_TYPE_Q4_K:
5236
- return dequantize_row_q4_K_cuda;
5237
- case GGML_TYPE_Q5_K:
5238
- return dequantize_row_q5_K_cuda;
5239
- case GGML_TYPE_Q6_K:
5240
- return dequantize_row_q6_K_cuda;
5241
- case GGML_TYPE_F16:
5242
- return convert_fp16_to_fp32_cuda;
5243
- default:
5244
- return nullptr;
5245
- }
5246
- }
5247
-
5248
5858
  static void ggml_mul_mat_q4_0_q8_1_cuda(
5249
5859
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
5250
5860
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -5737,6 +6347,39 @@ static void ggml_cpy_f32_f16_cuda(
5737
6347
  (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
5738
6348
  }
5739
6349
 
6350
+ static void ggml_cpy_f32_q8_0_cuda(
6351
+ const char * cx, char * cdst, const int ne,
6352
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
6353
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
6354
+
6355
+ GGML_ASSERT(ne % QK8_0 == 0);
6356
+ const int num_blocks = ne / QK8_0;
6357
+ cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
6358
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
6359
+ }
6360
+
6361
+ static void ggml_cpy_f32_q4_0_cuda(
6362
+ const char * cx, char * cdst, const int ne,
6363
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
6364
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
6365
+
6366
+ GGML_ASSERT(ne % QK4_0 == 0);
6367
+ const int num_blocks = ne / QK4_0;
6368
+ cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
6369
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
6370
+ }
6371
+
6372
+ static void ggml_cpy_f32_q4_1_cuda(
6373
+ const char * cx, char * cdst, const int ne,
6374
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
6375
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
6376
+
6377
+ GGML_ASSERT(ne % QK4_1 == 0);
6378
+ const int num_blocks = ne / QK4_1;
6379
+ cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
6380
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
6381
+ }
6382
+
5740
6383
  static void ggml_cpy_f16_f16_cuda(
5741
6384
  const char * cx, char * cdst, const int ne,
5742
6385
  const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -5823,6 +6466,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
5823
6466
  alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
5824
6467
  }
5825
6468
 
6469
+ static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6470
+ const dim3 block_dims(WARP_SIZE, 1, 1);
6471
+ const dim3 block_nums(1, nrows, 1);
6472
+ k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
6473
+ }
6474
+
6475
+ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
6476
+ // bitonic sort requires ncols to be power of 2
6477
+ GGML_ASSERT((ncols & (ncols - 1)) == 0);
6478
+
6479
+ const dim3 block_dims(ncols, 1, 1);
6480
+ const dim3 block_nums(1, nrows, 1);
6481
+ if (order == GGML_SORT_ASC) {
6482
+ k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
6483
+ } else if (order == GGML_SORT_DESC) {
6484
+ k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
6485
+ } else {
6486
+ GGML_ASSERT(false);
6487
+ }
6488
+ }
6489
+
5826
6490
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
5827
6491
  const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
5828
6492
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -5838,13 +6502,14 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
5838
6502
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
5839
6503
  }
5840
6504
 
5841
- static void im2col_f32_f16_cuda(const float * x, half * dst,
5842
- int OH, int IW, int IH, int OW, int IC,
5843
- int KH, int KW, int N, int ofs0, int ofs1,
5844
- int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
5845
- dim3 block_nums(IC, OH, OW);
5846
- dim3 block_dims(N, KH, KW);
5847
- im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
6505
+ static void im2col_f32_f16_cuda(const float* x, half* dst,
6506
+ int IW, int IH, int OW, int OH, int KW, int KH, int IC,
6507
+ int offset_delta,
6508
+ int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
6509
+ const int parallel_elements = OW * KW * KH;
6510
+ const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
6511
+ dim3 block_nums(num_blocks, OH, IC);
6512
+ im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
5848
6513
  }
5849
6514
 
5850
6515
  // buffer pool for cuda
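The im2col launch above is reorganized: instead of a 3-D thread block of (N, KH, KW), each thread now handles one element of a flat OW*KW*KH range, the grid supplies the output row (OH) and the channel (IC), and the kernel decodes (ix, kx, ky) from the flat index before writing into a [OH*OW, IC*KH*KW] matrix. A host-side reference of that layout for a single channel, with stride/padding/dilation as parameters (im2col_ref is an illustrative name; the CUDA kernel additionally adds the channel term and reads the channel plane via offset_delta):

    #include <cstddef>
    #include <vector>

    // im2col for one input channel: dst is (OH*OW) x (KH*KW), zero where the window
    // falls outside the IH x IW input
    static void im2col_ref(const std::vector<float> & x, int IW, int IH,
                           std::vector<float> & dst, int OW, int OH,
                           int KW, int KH, int s0, int s1, int p0, int p1, int d0, int d1) {
        dst.assign((size_t) OH*OW*KH*KW, 0.0f);
        for (int oh = 0; oh < OH; ++oh) {
            for (int ow = 0; ow < OW; ++ow) {
                for (int ky = 0; ky < KH; ++ky) {
                    for (int kx = 0; kx < KW; ++kx) {
                        const int iiw = ow*s0 + kx*d0 - p0; // input column
                        const int iih = oh*s1 + ky*d1 - p1; // input row
                        const size_t out = (size_t)(oh*OW + ow)*(KH*KW) + ky*KW + kx;
                        if (iih >= 0 && iih < IH && iiw >= 0 && iiw < IW) {
                            dst[out] = x[(size_t) iih*IW + iiw];
                        }
                    }
                }
            }
        }
    }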
@@ -5915,7 +6580,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
  return ptr;
  }
  #ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
  (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
  #endif
  void * ptr;
@@ -6053,7 +6718,7 @@ void * ggml_cuda_host_malloc(size_t size) {
  // The allocation error can be bypassed. A null ptr will assigned out of this function.
  // This can fixed the OOM error in WSL.
  cudaGetLastError();
- fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
+ fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
  size/1024.0/1024.0, cudaGetErrorString(err));
  return nullptr;
  }
@@ -6098,75 +6763,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
6098
6763
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
6099
6764
  if (nb0 == ts && nb1 == ts*ne0/bs) {
6100
6765
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
6101
- }
6102
- if (nb0 == ts) {
6766
+ } else if (nb0 == ts) {
6103
6767
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
6104
- }
6105
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
6106
- const void * rx = (const void *) ((const char *) x + i1*nb1);
6107
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
6108
- // pretend the row is a matrix with cols=1
6109
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
6110
- if (r != cudaSuccess) { return r; }
6111
- }
6112
- return cudaSuccess;
6113
- }
6114
-
6115
- static void ggml_cuda_op_repeat(
6116
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6117
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
6118
- // guaranteed to be an integer due to the check in ggml_can_repeat
6119
- const int64_t ne0 = dst->ne[0];
6120
- const int64_t ne1 = dst->ne[1];
6121
- const int64_t ne2 = dst->ne[2];
6122
- const int64_t ne3 = dst->ne[3];
6123
-
6124
- const int64_t ne00 = src0->ne[0];
6125
- const int64_t ne01 = src0->ne[1];
6126
- const int64_t ne02 = src0->ne[2];
6127
- const int64_t ne03 = src0->ne[3];
6128
-
6129
- const size_t nb0 = dst->nb[0];
6130
- const size_t nb1 = dst->nb[1];
6131
- const size_t nb2 = dst->nb[2];
6132
- const size_t nb3 = dst->nb[3];
6133
-
6134
- const size_t nb00 = src0->nb[0];
6135
- const size_t nb01 = src0->nb[1];
6136
- const size_t nb02 = src0->nb[2];
6137
- const size_t nb03 = src0->nb[3];
6138
-
6139
- const int nr0 = (int)(ne0/ne00);
6140
- const int nr1 = (int)(ne1/ne01);
6141
- const int nr2 = (int)(ne2/ne02);
6142
- const int nr3 = (int)(ne3/ne03);
6143
-
6144
- // TODO: support for transposed / permuted tensors
6145
- GGML_ASSERT(nb0 == sizeof(float));
6146
- GGML_ASSERT(nb00 == sizeof(float));
6147
-
6148
- // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
6149
- for (int i3 = 0; i3 < nr3; i3++) {
6150
- for (int k3 = 0; k3 < ne03; k3++) {
6151
- for (int i2 = 0; i2 < nr2; i2++) {
6152
- for (int k2 = 0; k2 < ne02; k2++) {
6153
- for (int i1 = 0; i1 < nr1; i1++) {
6154
- for (int k1 = 0; k1 < ne01; k1++) {
6155
- for (int i0 = 0; i0 < nr0; i0++) {
6156
- CUDA_CHECK(cudaMemcpyAsync(
6157
- (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
6158
- (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
6159
- ne00*nb0, cudaMemcpyDeviceToDevice, stream));
6160
- }
6161
- }
6162
- }
6163
- }
6164
- }
6768
+ } else {
6769
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
6770
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
6771
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
6772
+ // pretend the row is a matrix with cols=1
6773
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
6774
+ if (r != cudaSuccess) return r;
6165
6775
  }
6776
+ return cudaSuccess;
6166
6777
  }
6167
-
6168
- (void) src1;
6169
- (void) src1_d;
6170
6778
  }
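
The restructured ggml_cuda_cpy_tensor_2d above keeps the same three-way strategy: one bulk copy when the slab is fully contiguous, one pitched 2D copy when only the rows are contiguous, and a per-row fallback otherwise. A CPU-only sketch of that decision, with memcpy standing in for the cudaMemcpyAsync variants (hypothetical sizes, not the ggml layout):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // copy i1_diff rows of ne0 elements (ts bytes each) from a source with element
    // stride nb0 and row stride nb1 into a tightly packed destination
    static void copy_2d(char * dst, const char * src, size_t ts, int64_t ne0, int64_t i1_diff, size_t nb0, size_t nb1) {
        if (nb0 == ts && nb1 == ts*ne0) {
            memcpy(dst, src, i1_diff*nb1);                       // fully contiguous: one copy
        } else if (nb0 == ts) {
            for (int64_t i1 = 0; i1 < i1_diff; i1++) {           // row-contiguous: one copy per row
                memcpy(dst + i1*ts*ne0, src + i1*nb1, ts*ne0);   // (the CUDA path folds this into cudaMemcpy2DAsync)
            }
        } else {
            for (int64_t i1 = 0; i1 < i1_diff; i1++) {           // strided elements: copy element by element
                for (int64_t i0 = 0; i0 < ne0; i0++) {
                    memcpy(dst + (i1*ne0 + i0)*ts, src + i1*nb1 + i0*nb0, ts);
                }
            }
        }
    }

    int main() {
        float src[4][6] = {}; // 4 rows of 6 floats, but only the first 4 columns are copied
        for (int r = 0; r < 4; r++) for (int c = 0; c < 6; c++) src[r][c] = (float)(10*r + c);
        float dst[4*4];
        copy_2d((char *) dst, (const char *) src, sizeof(float), 4, 4, sizeof(float), 6*sizeof(float));
        printf("%g %g\n", dst[0], dst[5]); // 0 11
    }
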
6171
6779
 
6172
6780
  static void ggml_cuda_op_get_rows(
@@ -6175,36 +6783,34 @@ static void ggml_cuda_op_get_rows(
6175
6783
 
6176
6784
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
6177
6785
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
6178
- GGML_ASSERT(ggml_is_contiguous(src0));
6179
- GGML_ASSERT(ggml_is_contiguous(src1));
6180
- GGML_ASSERT(ggml_is_contiguous(dst));
6181
6786
 
6182
- const int ncols = src0->ne[0];
6183
- const int nrows = ggml_nelements(src1);
6787
+ GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
6788
+ GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
6789
+ GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
6184
6790
 
6185
6791
  const int32_t * src1_i32 = (const int32_t *) src1_d;
6186
6792
 
6187
6793
  switch (src0->type) {
6188
6794
  case GGML_TYPE_F16:
6189
- get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6795
+ get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
6190
6796
  break;
6191
6797
  case GGML_TYPE_F32:
6192
- get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6798
+ get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6193
6799
  break;
6194
6800
  case GGML_TYPE_Q4_0:
6195
- get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6801
+ get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6196
6802
  break;
6197
6803
  case GGML_TYPE_Q4_1:
6198
- get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6804
+ get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6199
6805
  break;
6200
6806
  case GGML_TYPE_Q5_0:
6201
- get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6807
+ get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6202
6808
  break;
6203
6809
  case GGML_TYPE_Q5_1:
6204
- get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6810
+ get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6205
6811
  break;
6206
6812
  case GGML_TYPE_Q8_0:
6207
- get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6813
+ get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6208
6814
  break;
6209
6815
  default:
6210
6816
  // TODO: k-quants
@@ -6213,46 +6819,76 @@ static void ggml_cuda_op_get_rows(
6213
6819
  }
6214
6820
  }
6215
6821
 
6216
- inline void ggml_cuda_op_add(
6822
+ template<class op>
6823
+ inline void ggml_cuda_op_bin_bcast(
6217
6824
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6218
6825
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6219
6826
 
6220
6827
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6221
6828
 
6222
- const int64_t ne10 = src1->ne[0];
6223
- const int64_t ne11 = src1->ne[1];
6224
-
6225
6829
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
6226
- add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
6830
+ op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6227
6831
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
6228
- add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
6832
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
6229
6833
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
6230
- add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
6834
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
6231
6835
  } else {
6232
- fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
6836
+ fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
6837
+ ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
6233
6838
  GGML_ASSERT(false);
6234
6839
  }
6840
+ }
6841
+
6842
+ static void ggml_cuda_op_repeat(
6843
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6844
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
6845
+
6846
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
6235
6847
 
6236
6848
  (void) src1;
6237
- (void) dst;
6849
+ (void) src1_d;
6238
6850
  }
6239
6851
 
6240
- inline void ggml_cuda_op_mul(
6852
+ inline void ggml_cuda_op_add(
6853
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6854
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6855
+
6856
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6857
+ }
6858
+
6859
+ inline void ggml_cuda_op_acc(
6241
6860
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6242
6861
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6243
6862
 
6244
6863
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6245
6864
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6246
6865
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6866
+ GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
6247
6867
 
6248
- const int64_t ne10 = src1->ne[0];
6249
- const int64_t ne11 = src1->ne[1];
6868
+ int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
6869
+ int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
6870
+ // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
6871
+ int offset = dst->op_params[3] / 4; // offset in bytes
6250
6872
 
6251
- mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
6873
+ acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
6252
6874
 
6253
6875
  (void) dst;
6254
6876
  }
6255
6877
 
6878
+ inline void ggml_cuda_op_mul(
6879
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6880
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6881
+
6882
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6883
+ }
6884
+
6885
+ inline void ggml_cuda_op_div(
6886
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6887
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6888
+
6889
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6890
+ }
6891
+
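
ggml_cuda_op_add, ggml_cuda_op_mul and ggml_cuda_op_div above are now thin wrappers that only select a functor type for the shared broadcast helper. A minimal CPU sketch of that dispatch shape, with simplified stand-ins for the op functors (not the ggml kernels):

    #include <cstdio>

    // hypothetical stand-ins for the op functor types picked at compile time
    struct op_add { float operator()(float a, float b) const { return a + b; } };
    struct op_mul { float operator()(float a, float b) const { return a * b; } };
    struct op_div { float operator()(float a, float b) const { return a / b; } };

    // one generic element-wise helper; each thin wrapper only chooses the functor type,
    // mirroring how ggml_cuda_op_add/mul/div all forward to ggml_cuda_op_bin_bcast<...>
    template <class op>
    static void bin_op(const float * x, const float * y, float * dst, int n) {
        for (int i = 0; i < n; i++) {
            dst[i] = op()(x[i], y[i]);
        }
    }

    int main() {
        const float x[3] = {1.0f, 2.0f, 3.0f};
        const float y[3] = {4.0f, 5.0f, 6.0f};
        float d[3];
        bin_op<op_mul>(x, y, d, 3); // identical call shape for op_add / op_div
        printf("%g %g %g\n", d[0], d[1], d[2]); // 4 10 18
    }
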
6256
6892
  inline void ggml_cuda_op_gelu(
6257
6893
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6258
6894
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6281,6 +6917,34 @@ inline void ggml_cuda_op_silu(
6281
6917
  (void) src1_dd;
6282
6918
  }
6283
6919
 
6920
+ inline void ggml_cuda_op_gelu_quick(
6921
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6922
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6923
+
6924
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6925
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6926
+
6927
+ gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6928
+
6929
+ (void) src1;
6930
+ (void) dst;
6931
+ (void) src1_dd;
6932
+ }
6933
+
6934
+ inline void ggml_cuda_op_tanh(
6935
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6936
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6937
+
6938
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6939
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6940
+
6941
+ tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6942
+
6943
+ (void) src1;
6944
+ (void) dst;
6945
+ (void) src1_dd;
6946
+ }
6947
+
6284
6948
  inline void ggml_cuda_op_relu(
6285
6949
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6286
6950
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6295,38 +6959,38 @@ inline void ggml_cuda_op_relu(
6295
6959
  (void) src1_dd;
6296
6960
  }
6297
6961
 
6298
- inline void ggml_cuda_op_sqr(
6962
+ inline void ggml_cuda_op_leaky_relu(
6299
6963
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6300
6964
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6301
6965
 
6302
6966
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6303
6967
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6304
6968
 
6305
- sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6969
+ float negative_slope;
6970
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
6971
+
6972
+ leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
6306
6973
 
6307
6974
  (void) src1;
6308
6975
  (void) dst;
6309
6976
  (void) src1_dd;
6310
6977
  }
6311
6978
 
6312
- inline void ggml_cuda_op_norm(
6979
+ inline void ggml_cuda_op_sqr(
6313
6980
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6314
6981
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6315
6982
 
6316
6983
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6317
6984
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6318
6985
 
6319
- const int64_t ne00 = src0->ne[0];
6320
- const int64_t nrows = ggml_nrows(src0);
6321
-
6322
- norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
6986
+ sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6323
6987
 
6324
6988
  (void) src1;
6325
6989
  (void) dst;
6326
6990
  (void) src1_dd;
6327
6991
  }
6328
6992
 
6329
- inline void ggml_cuda_op_rms_norm(
6993
+ inline void ggml_cuda_op_norm(
6330
6994
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6331
6995
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6332
6996
 
@@ -6339,26 +7003,111 @@ inline void ggml_cuda_op_rms_norm(
6339
7003
  float eps;
6340
7004
  memcpy(&eps, dst->op_params, sizeof(float));
6341
7005
 
6342
- rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
7006
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
6343
7007
 
6344
7008
  (void) src1;
6345
7009
  (void) dst;
6346
7010
  (void) src1_dd;
6347
7011
  }
6348
7012
 
6349
- inline void ggml_cuda_op_mul_mat_q(
6350
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
6351
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6352
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
6353
7013
 
6354
- const int64_t ne00 = src0->ne[0];
7014
+ inline void ggml_cuda_op_group_norm(
7015
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7016
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6355
7017
 
6356
- const int64_t ne10 = src1->ne[0];
6357
- GGML_ASSERT(ne10 % QK8_1 == 0);
7018
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7019
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6358
7020
 
6359
- const int64_t ne0 = dst->ne[0];
7021
+ int num_groups = dst->op_params[0];
7022
+ int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
7023
+ group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
6360
7024
 
6361
- const int64_t row_diff = row_high - row_low;
7025
+ (void) src1;
7026
+ (void) dst;
7027
+ (void) src1_dd;
7028
+ }
7029
+
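
group_size above is the per-group element count obtained by splitting the ne[2] channels into num_groups groups and rounding the channels-per-group up; a tiny sketch of that arithmetic with hypothetical extents:

    #include <cstdio>

    int main() {
        // hypothetical extents standing in for src0->ne[0..2]
        const int ne0 = 8, ne1 = 8, ne2 = 10;
        const int num_groups = 4;                                            // dst->op_params[0] in the code above
        const int channels_per_group = (ne2 + num_groups - 1) / num_groups;  // ceil(10 / 4) = 3
        const int group_size = ne0 * ne1 * channels_per_group;               // 192 elements normalized together
        printf("group_size = %d of %d total\n", group_size, ne0 * ne1 * ne2);
    }
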
7030
+ inline void ggml_cuda_op_concat(
7031
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7032
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7033
+
7034
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7035
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
7036
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
7037
+
7038
+ for (int i3 = 0; i3 < dst->ne[3]; i3++) {
7039
+ concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
7040
+ }
7041
+
7042
+ (void) src1;
7043
+ (void) dst;
7044
+ }
7045
+
7046
+ inline void ggml_cuda_op_upscale(
7047
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7048
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7049
+
7050
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7051
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
7052
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
7053
+
7054
+ const int scale_factor = dst->op_params[0];
7055
+
7056
+ upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
7057
+
7058
+ (void) src1;
7059
+ (void) dst;
7060
+ }
7061
+
7062
+ inline void ggml_cuda_op_pad(
7063
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7064
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7065
+
7066
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7067
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
7068
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
7069
+
7070
+ pad_f32_cuda(src0_dd, dst_dd,
7071
+ src0->ne[0], src0->ne[1], src0->ne[2],
7072
+ dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
7073
+
7074
+ (void) src1;
7075
+ (void) dst;
7076
+ }
7077
+
7078
+ inline void ggml_cuda_op_rms_norm(
7079
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7080
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7081
+
7082
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7083
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7084
+
7085
+ const int64_t ne00 = src0->ne[0];
7086
+ const int64_t nrows = ggml_nrows(src0);
7087
+
7088
+ float eps;
7089
+ memcpy(&eps, dst->op_params, sizeof(float));
7090
+
7091
+ rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
7092
+
7093
+ (void) src1;
7094
+ (void) dst;
7095
+ (void) src1_dd;
7096
+ }
7097
+
7098
+ inline void ggml_cuda_op_mul_mat_q(
7099
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7100
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7101
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
7102
+
7103
+ const int64_t ne00 = src0->ne[0];
7104
+
7105
+ const int64_t ne10 = src1->ne[0];
7106
+ GGML_ASSERT(ne10 % QK8_1 == 0);
7107
+
7108
+ const int64_t ne0 = dst->ne[0];
7109
+
7110
+ const int64_t row_diff = row_high - row_low;
6362
7111
 
6363
7112
  int id;
6364
7113
  CUDA_CHECK(cudaGetDevice(&id));
@@ -6474,6 +7223,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
6474
7223
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6475
7224
  const int64_t src1_padded_row_size, const cudaStream_t & stream) {
6476
7225
 
7226
+ GGML_ASSERT(ggml_nrows(src1) == 1);
7227
+
6477
7228
  const int64_t ne00 = src0->ne[0];
6478
7229
  const int64_t row_diff = row_high - row_low;
6479
7230
 
@@ -6533,7 +7284,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
6533
7284
  size_t ash;
6534
7285
  dfloat * src1_dfloat = nullptr; // dfloat == half
6535
7286
 
6536
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
7287
+ bool src1_convert_f16 =
7288
+ src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
6537
7289
  src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
6538
7290
  src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
6539
7291
 
@@ -6837,7 +7589,6 @@ inline void ggml_cuda_op_im2col(
6837
7589
 
6838
7590
  const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
6839
7591
 
6840
- const int64_t N = src1->ne[is_2D ? 3 : 2];
6841
7592
  const int64_t IC = src1->ne[is_2D ? 2 : 1];
6842
7593
  const int64_t IH = is_2D ? src1->ne[1] : 1;
6843
7594
  const int64_t IW = src1->ne[0];
@@ -6848,17 +7599,51 @@ inline void ggml_cuda_op_im2col(
6848
7599
  const int64_t OH = is_2D ? dst->ne[2] : 1;
6849
7600
  const int64_t OW = dst->ne[1];
6850
7601
 
6851
- const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
6852
- const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
7602
+ const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
6853
7603
 
6854
- im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
6855
- OH, IW, IH, OW, IC, KH, KW, N,
6856
- ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
7604
+ im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
6857
7605
 
6858
7606
  (void) src0;
6859
7607
  (void) src0_dd;
6860
7608
  }
6861
7609
 
7610
+
7611
+ inline void ggml_cuda_op_sum_rows(
7612
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7613
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7614
+
7615
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7616
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7617
+
7618
+ const int64_t ncols = src0->ne[0];
7619
+ const int64_t nrows = ggml_nrows(src0);
7620
+
7621
+ sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
7622
+
7623
+ (void) src1;
7624
+ (void) dst;
7625
+ (void) src1_dd;
7626
+ }
7627
+
7628
+ inline void ggml_cuda_op_argsort(
7629
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7630
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7631
+
7632
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7633
+ GGML_ASSERT( dst->type == GGML_TYPE_I32);
7634
+
7635
+ const int64_t ncols = src0->ne[0];
7636
+ const int64_t nrows = ggml_nrows(src0);
7637
+
7638
+ enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
7639
+
7640
+ argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
7641
+
7642
+ (void) src1;
7643
+ (void) dst;
7644
+ (void) src1_dd;
7645
+ }
7646
+
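
ggml_cuda_op_argsort above writes, for every row of ncols floats, the column indices in sorted order into an I32 destination. The same contract on the CPU for a single row (std::sort over an index vector, made-up data):

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const std::vector<float> row = {0.3f, -1.0f, 2.5f, 0.0f}; // one row, ncols = 4
        std::vector<int> idx(row.size());
        std::iota(idx.begin(), idx.end(), 0);
        // ascending order here; the CUDA kernel also handles descending via dst->op_params[0]
        std::sort(idx.begin(), idx.end(), [&](int a, int b) { return row[a] < row[b]; });
        for (int i : idx) {
            printf("%d ", i); // prints: 1 3 0 2
        }
        printf("\n");
    }
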
6862
7647
  inline void ggml_cuda_op_diag_mask_inf(
6863
7648
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6864
7649
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7067,7 +7852,7 @@ static void ggml_cuda_op_mul_mat(
7067
7852
  const int64_t ne01 = src0->ne[1];
7068
7853
  const int64_t ne02 = src0->ne[2];
7069
7854
  const int64_t ne03 = src0->ne[3];
7070
- // const int64_t nrows0 = ggml_nrows(src0);
7855
+ const int64_t nrows0 = ggml_nrows(src0);
7071
7856
 
7072
7857
  const int64_t ne10 = src1->ne[0];
7073
7858
  const int64_t ne11 = src1->ne[1];
@@ -7103,10 +7888,9 @@ static void ggml_cuda_op_mul_mat(
7103
7888
 
7104
7889
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
7105
7890
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
7106
-
7107
7891
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
7108
- const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
7109
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
7892
+
7893
+ const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
7110
7894
 
7111
7895
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
7112
7896
  GGML_ASSERT(!(split && ne02 > 1));
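
src1_padded_col_size above replaces the explicit round-up expression with GGML_PAD; a quick standalone check (assuming only that GGML_PAD(x, n) rounds x up to the next multiple of n) that the two forms agree for the 512-element row padding:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // assumed behaviour of GGML_PAD(x, n): round x up to the next multiple of n
    static int64_t pad_up(int64_t x, int64_t n) {
        return ((x + n - 1) / n) * n;
    }

    int main() {
        const int64_t MATRIX_ROW_PADDING = 512;
        for (int64_t ne10 = 1; ne10 <= 4096; ne10++) {
            const int64_t old_form = ne10 % MATRIX_ROW_PADDING == 0 ?
                ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
            assert(old_form == pad_up(ne10, MATRIX_ROW_PADDING));
        }
        printf("round-up forms agree for 1..4096\n");
    }
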
@@ -7231,7 +8015,7 @@ static void ggml_cuda_op_mul_mat(
7231
8015
  const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
7232
8016
 
7233
8017
  // for split tensors the data begins at i0 == i0_offset_low
7234
- char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
8018
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
7235
8019
  float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
7236
8020
  char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
7237
8021
  float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@@ -7372,10 +8156,18 @@ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, gg
7372
8156
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
7373
8157
  }
7374
8158
 
8159
+ static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8160
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
8161
+ }
8162
+
7375
8163
  static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7376
8164
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
7377
8165
  }
7378
8166
 
8167
+ static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8168
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
8169
+ }
8170
+
7379
8171
  static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7380
8172
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
7381
8173
  }
@@ -7384,10 +8176,22 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
7384
8176
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
7385
8177
  }
7386
8178
 
8179
+ static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8180
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
8181
+ }
8182
+
8183
+ static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8184
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
8185
+ }
8186
+
7387
8187
  static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7388
8188
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
7389
8189
  }
7390
8190
 
8191
+ static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8192
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
8193
+ }
8194
+
7391
8195
  static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7392
8196
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
7393
8197
  }
@@ -7396,12 +8200,28 @@ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, g
7396
8200
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
7397
8201
  }
7398
8202
 
8203
+ static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8204
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
8205
+ }
8206
+
8207
+ static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8208
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
8209
+ }
8210
+
8211
+ static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8212
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
8213
+ }
8214
+
8215
+ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8216
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
8217
+ }
8218
+
7399
8219
  static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7400
8220
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
7401
8221
  }
7402
8222
 
7403
8223
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
7404
- if (!g_cublas_loaded) { return false; }
8224
+ if (!g_cublas_loaded) return false;
7405
8225
 
7406
8226
  const int64_t ne10 = src1->ne[0];
7407
8227
 
@@ -7479,7 +8299,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7479
8299
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7480
8300
  }
7481
8301
 
7482
- __global__ static void k_compute_batched_ptrs(
8302
+ static __global__ void k_compute_batched_ptrs(
7483
8303
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7484
8304
  const void ** ptrs_src, void ** ptrs_dst,
7485
8305
  int ne12, int ne13,
@@ -7535,9 +8355,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7535
8355
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7536
8356
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7537
8357
 
7538
- int id;
7539
- CUDA_CHECK(cudaGetDevice(&id));
7540
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
8358
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
7541
8359
 
7542
8360
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7543
8361
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -7594,7 +8412,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7594
8412
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
7595
8413
  // use cublasGemmStridedBatchedEx
7596
8414
  CUBLAS_CHECK(
7597
- cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
8415
+ cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7598
8416
  ne01, ne11, ne10,
7599
8417
  &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
7600
8418
  (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
@@ -7628,7 +8446,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7628
8446
  CUDA_CHECK(cudaGetLastError());
7629
8447
 
7630
8448
  CUBLAS_CHECK(
7631
- cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
8449
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7632
8450
  ne01, ne11, ne10,
7633
8451
  &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7634
8452
  (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
@@ -7698,10 +8516,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7698
8516
  #ifdef GGML_CUDA_FORCE_DMMV
7699
8517
  const bool use_mul_mat_vec_q = false;
7700
8518
  #else
7701
- const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
8519
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
7702
8520
  #endif // GGML_CUDA_FORCE_DMMV
7703
8521
 
7704
8522
  if (use_mul_mat_vec_q) {
8523
+ // NOTE: this kernel does not support ggml_nrows(src1) > 1
7705
8524
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
7706
8525
  } else {
7707
8526
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
@@ -7726,6 +8545,252 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7726
8545
  }
7727
8546
  }
7728
8547
 
8548
+ #if 0
8549
+ template<typename ... Srcs>
8550
+ static __global__ void k_compute_batched_ptrs_id(
8551
+ const void ** ptrs_src, void ** ptrs_dst,
8552
+ int ne12, int ne13,
8553
+ int ne23,
8554
+ int nb02, int nb03,
8555
+ int nb12, int nb13,
8556
+ int nb2, int nb3,
8557
+ int r2, int r3,
8558
+ ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
8559
+ const half * src1_f16, half * dst_f16,
8560
+ const int32_t * ids, const int id,
8561
+ Srcs... src0s) {
8562
+
8563
+ int i = ids[id];
8564
+
8565
+ half * src0_f16;
8566
+ const void * srcs_ar[] = { (const half *) src0s... };
8567
+ if (src0_type == GGML_TYPE_F16) {
8568
+ src0_f16 = (half *) srcs_ar[i];
8569
+ } else {
8570
+ src0_f16 = src0_as_f16;
8571
+ if (threadIdx.x == 0 && threadIdx.y == 0) {
8572
+ const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
8573
+ to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
8574
+ }
8575
+ }
8576
+
8577
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
8578
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
8579
+
8580
+ if (i13 >= ne13 || i12 >= ne12) {
8581
+ return;
8582
+ }
8583
+
8584
+ int i03 = i13 / r3;
8585
+ int i02 = i12 / r2;
8586
+
8587
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
8588
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
8589
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
8590
+ }
8591
+
8592
+ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
8593
+ const struct ggml_tensor * ids = dst->src[0];
8594
+ const struct ggml_tensor * src1 = dst->src[1];
8595
+ const struct ggml_tensor * src00 = dst->src[2];
8596
+
8597
+ const int id = dst->op_params[0];
8598
+
8599
+ GGML_ASSERT(!ggml_is_transposed(src00));
8600
+ GGML_ASSERT(!ggml_is_transposed(src1));
8601
+
8602
+ GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
8603
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
8604
+
8605
+ const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
8606
+ const int64_t ne01 = src00->ne[1];
8607
+ const int64_t ne02 = src00->ne[2];
8608
+ const int64_t ne03 = src00->ne[3];
8609
+
8610
+ //const int64_t nb01 = src00->nb[1];
8611
+ const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
8612
+ const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
8613
+
8614
+ const int64_t ne10 = src1->ne[0];
8615
+ const int64_t ne11 = src1->ne[1];
8616
+ const int64_t ne12 = src1->ne[2];
8617
+ const int64_t ne13 = src1->ne[3];
8618
+
8619
+ //const int64_t nb11 = src1->nb[1];
8620
+ const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
8621
+ const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
8622
+
8623
+ const int64_t ne1 = ggml_nelements(src1);
8624
+ const int64_t ne = ggml_nelements(dst);
8625
+
8626
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8627
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8628
+
8629
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
8630
+
8631
+ //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
8632
+ //void * src0_ddq = src0_extra->data_device[g_main_device];
8633
+ //half * src0_as_f16 = (half *) src0_ddq;
8634
+
8635
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
8636
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
8637
+
8638
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
8639
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
8640
+
8641
+ // convert src1 to fp16
8642
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
8643
+ GGML_ASSERT(to_fp16_cuda != nullptr);
8644
+
8645
+ size_t src1_as = 0;
8646
+ half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
8647
+ to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
8648
+
8649
+ size_t dst_as = 0;
8650
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8651
+
8652
+ GGML_ASSERT(ne12 % ne02 == 0);
8653
+ GGML_ASSERT(ne13 % ne03 == 0);
8654
+
8655
+ // broadcast factors
8656
+ const int64_t r2 = ne12/ne02;
8657
+ const int64_t r3 = ne13/ne03;
8658
+
8659
+ const half alpha_f16 = 1.0f;
8660
+ const half beta_f16 = 0.0f;
8661
+
8662
+ // use cublasGemmBatchedEx
8663
+ const int ne23 = ne12*ne13;
8664
+
8665
+ const void ** ptrs_src = nullptr;
8666
+ void ** ptrs_dst = nullptr;
8667
+
8668
+ size_t ptrs_src_s = 0;
8669
+ size_t ptrs_dst_s = 0;
8670
+
8671
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
8672
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
8673
+
8674
+ int64_t src0_ne = ggml_nelements(src00);
8675
+ half * src0_as_f16 = nullptr;
8676
+ size_t src0_as = 0;
8677
+ if (src00->type != GGML_TYPE_F16) {
8678
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
8679
+ }
8680
+
8681
+ static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
8682
+ dim3 block_dims(ne13, ne12);
8683
+ k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
8684
+ ptrs_src, ptrs_dst,
8685
+ ne12, ne13,
8686
+ ne23,
8687
+ ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
8688
+ nb12, nb13,
8689
+ dst->nb[2], dst->nb[3],
8690
+ r2, r3,
8691
+ src00->type, src0_as_f16, src0_ne,
8692
+ src1_as_f16, dst_f16,
8693
+ (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
8694
+ dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
8695
+ dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
8696
+ dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
8697
+ dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
8698
+ );
8699
+ CUDA_CHECK(cudaGetLastError());
8700
+
8701
+ CUBLAS_CHECK(
8702
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8703
+ ne01, ne11, ne10,
8704
+ &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
8705
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
8706
+ &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
8707
+ ne23,
8708
+ CUBLAS_COMPUTE_16F,
8709
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8710
+
8711
+ if (src0_as != 0) {
8712
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
8713
+ }
8714
+ if (ptrs_src_s != 0) {
8715
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
8716
+ }
8717
+ if (ptrs_dst_s != 0) {
8718
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
8719
+ }
8720
+
8721
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8722
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8723
+
8724
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
8725
+ ggml_cuda_pool_free(dst_f16, dst_as);
8726
+ }
8727
+ #endif
8728
+
8729
+ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8730
+ #if 0
8731
+ ggml_cuda_mul_mat_id_cublas(dst);
8732
+ // TODO: mmq/mmv support
8733
+ #endif
8734
+
8735
+ GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
8736
+
8737
+ const struct ggml_tensor * ids = src0;
8738
+ const int32_t id = ((int32_t *) dst->op_params)[0];
8739
+ const int32_t n_as = ((int32_t *) dst->op_params)[1];
8740
+
8741
+ std::vector<char> ids_host(ggml_nbytes(ids));
8742
+
8743
+ if (ids->backend == GGML_BACKEND_GPU) {
8744
+ const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8745
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8746
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8747
+ } else {
8748
+ memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
8749
+ }
8750
+
8751
+ const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
8752
+ const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
8753
+
8754
+ ggml_tensor_extra_gpu src1_row_extra;
8755
+ ggml_tensor_extra_gpu dst_row_extra;
8756
+
8757
+ ggml_tensor src1_row = *src1;
8758
+ ggml_tensor dst_row = *dst;
8759
+
8760
+ src1_row.ne[1] = 1;
8761
+ dst_row.ne[1] = 1;
8762
+
8763
+ src1_row.nb[2] = src1_row.nb[1];
8764
+ dst_row.nb[2] = dst_row.nb[1];
8765
+
8766
+ src1_row.nb[3] = src1_row.nb[1];
8767
+ dst_row.nb[3] = dst_row.nb[1];
8768
+
8769
+ src1_row.extra = &src1_row_extra;
8770
+ dst_row.extra = &dst_row_extra;
8771
+
8772
+
8773
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8774
+ //int32_t row_id;
8775
+ //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8776
+ //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8777
+
8778
+ const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8779
+
8780
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
8781
+
8782
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
8783
+
8784
+ src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
8785
+ src1_row.data = (char *) src1->data + i01*src1->nb[1];
8786
+
8787
+ dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
8788
+ dst_row.data = (char *) dst->data + i01*dst->nb[1];
8789
+
8790
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8791
+ }
8792
+ }
8793
+
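
ggml_cuda_mul_mat_id above copies the ids tensor to the host and then, for every position i01, selects one expert matrix and multiplies it against a one-row view of src1 and dst. A CPU-only sketch of that per-row expert selection (made-up shapes, not the ggml tensor layout):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // multiply one (n_rows x n_cols) expert matrix against a single input row
    static void mat_vec(const std::vector<float> & w, const float * x, float * y, int n_rows, int n_cols) {
        for (int r = 0; r < n_rows; r++) {
            float acc = 0.0f;
            for (int c = 0; c < n_cols; c++) {
                acc += w[r*n_cols + c] * x[c];
            }
            y[r] = acc;
        }
    }

    int main() {
        const int n_as = 2, n_rows = 2, n_cols = 3, n_tokens = 4;
        const std::vector<std::vector<float>> experts = {
            {1,0,0,  0,1,0},  // expert 0
            {2,0,0,  0,0,2},  // expert 1
        };
        const std::vector<int32_t> ids = {0, 1, 1, 0};        // one expert id per token (row)
        const std::vector<float> src1(n_tokens*n_cols, 1.0f); // every token is (1,1,1)
        std::vector<float> dst(n_tokens*n_rows);

        for (int i01 = 0; i01 < n_tokens; i01++) {
            const int32_t row_id = ids[i01];                  // which expert handles this row
            if (row_id < 0 || row_id >= n_as) continue;       // mirrors the GGML_ASSERT above
            mat_vec(experts[row_id], &src1[i01*n_cols], &dst[i01*n_rows], n_rows, n_cols);
        }
        printf("token 0 -> %g %g, token 1 -> %g %g\n", dst[0], dst[1], dst[2], dst[3]);
    }
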
7729
8794
  static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7730
8795
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
7731
8796
  }
@@ -7770,14 +8835,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7770
8835
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
7771
8836
 
7772
8837
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
7773
- ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7774
- ne10, ne11, nb10, nb11, nb12, main_stream);
8838
+ ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7775
8839
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
7776
- ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7777
- ne10, ne11, nb10, nb11, nb12, main_stream);
8840
+ ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8841
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
8842
+ ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8843
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
8844
+ ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8845
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
8846
+ ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7778
8847
  } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
7779
- ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7780
- ne10, ne11, nb10, nb11, nb12, main_stream);
8848
+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7781
8849
  } else {
7782
8850
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
7783
8851
  ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7788,6 +8856,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7788
8856
  }
7789
8857
 
7790
8858
  static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8859
+ // TODO: why do we pass dst as src1 here?
7791
8860
  ggml_cuda_cpy(src0, dst, nullptr);
7792
8861
  (void) src1;
7793
8862
  }
@@ -7813,12 +8882,28 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
7813
8882
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
7814
8883
  }
7815
8884
 
8885
+ static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8886
+ GGML_ASSERT(ggml_is_contiguous(src0));
8887
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
8888
+ }
8889
+
8890
+ static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8891
+ GGML_ASSERT(ggml_is_contiguous(src0));
8892
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
8893
+ }
8894
+
7816
8895
  static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7817
8896
  (void) src0;
7818
8897
  (void) src1;
7819
8898
  (void) dst;
7820
8899
  }
7821
8900
 
8901
+ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
8902
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
8903
+
8904
+ return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
8905
+ }
8906
+
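
ggml_nbytes_split above sizes a split as nrows_split times the byte size of one row, where a row of a (possibly quantized) type packs ne[0] elements into ne[0]/block_size blocks of type_size bytes. A small sketch of that arithmetic with a hypothetical Q4_0-like layout (32 values per 18-byte block):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // assumed shape of ggml_row_size: bytes needed for ne elements of a blocked type
    static int64_t row_size(int64_t ne, int64_t type_size, int64_t blck_size) {
        assert(ne % blck_size == 0);
        return ne / blck_size * type_size;
    }

    int main() {
        // hypothetical Q4_0-like layout: 32 values stored in an 18-byte block
        const int64_t ne0 = 4096, blck_size = 32, type_size = 18;
        const int64_t nrows_split = 100;
        const int64_t nbytes_split = nrows_split * row_size(ne0, type_size, blck_size);
        printf("%lld rows of %lld elements -> %lld bytes\n",
               (long long) nrows_split, (long long) ne0, (long long) nbytes_split);
    }
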
7822
8907
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
7823
8908
  const int64_t nrows = ggml_nrows(tensor);
7824
8909
 
@@ -7868,8 +8953,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
7868
8953
 
7869
8954
  // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
7870
8955
  if (ne0 % MATRIX_ROW_PADDING != 0) {
7871
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
7872
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8956
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
7873
8957
  }
7874
8958
 
7875
8959
  char * buf;
@@ -8068,8 +9152,9 @@ void ggml_cuda_set_main_device(const int main_device) {
8068
9152
  main_device, g_device_count, g_main_device);
8069
9153
  return;
8070
9154
  }
8071
- g_main_device = main_device;
8072
- if (g_device_count > 1) {
9155
+
9156
+ if (g_main_device != main_device && g_device_count > 1) {
9157
+ g_main_device = main_device;
8073
9158
  cudaDeviceProp prop;
8074
9159
  CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
8075
9160
  fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
@@ -8095,7 +9180,7 @@ void ggml_cuda_free_scratch() {
8095
9180
  }
8096
9181
 
8097
9182
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
8098
- if (!g_cublas_loaded) { return false; }
9183
+ if (!g_cublas_loaded) return false;
8099
9184
 
8100
9185
  ggml_cuda_func_t func;
8101
9186
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8128,9 +9213,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8128
9213
  case GGML_OP_ADD:
8129
9214
  func = ggml_cuda_add;
8130
9215
  break;
9216
+ case GGML_OP_ACC:
9217
+ func = ggml_cuda_acc;
9218
+ break;
8131
9219
  case GGML_OP_MUL:
8132
9220
  func = ggml_cuda_mul;
8133
9221
  break;
9222
+ case GGML_OP_DIV:
9223
+ func = ggml_cuda_div;
9224
+ break;
8134
9225
  case GGML_OP_UNARY:
8135
9226
  switch (ggml_get_unary_op(tensor)) {
8136
9227
  case GGML_UNARY_OP_GELU:
@@ -8139,15 +9230,37 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8139
9230
  case GGML_UNARY_OP_SILU:
8140
9231
  func = ggml_cuda_silu;
8141
9232
  break;
9233
+ case GGML_UNARY_OP_GELU_QUICK:
9234
+ func = ggml_cuda_gelu_quick;
9235
+ break;
9236
+ case GGML_UNARY_OP_TANH:
9237
+ func = ggml_cuda_tanh;
9238
+ break;
8142
9239
  case GGML_UNARY_OP_RELU:
8143
9240
  func = ggml_cuda_relu;
8144
9241
  break;
8145
9242
  default:
8146
9243
  return false;
8147
- } break;
9244
+ }
9245
+ break;
8148
9246
  case GGML_OP_NORM:
8149
9247
  func = ggml_cuda_norm;
8150
9248
  break;
9249
+ case GGML_OP_GROUP_NORM:
9250
+ func = ggml_cuda_group_norm;
9251
+ break;
9252
+ case GGML_OP_CONCAT:
9253
+ func = ggml_cuda_concat;
9254
+ break;
9255
+ case GGML_OP_UPSCALE:
9256
+ func = ggml_cuda_upscale;
9257
+ break;
9258
+ case GGML_OP_PAD:
9259
+ func = ggml_cuda_pad;
9260
+ break;
9261
+ case GGML_OP_LEAKY_RELU:
9262
+ func = ggml_cuda_leaky_relu;
9263
+ break;
8151
9264
  case GGML_OP_RMS_NORM:
8152
9265
  func = ggml_cuda_rms_norm;
8153
9266
  break;
@@ -8157,6 +9270,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8157
9270
  }
8158
9271
  func = ggml_cuda_mul_mat;
8159
9272
  break;
9273
+ case GGML_OP_MUL_MAT_ID:
9274
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
9275
+ return false;
9276
+ }
9277
+ func = ggml_cuda_mul_mat_id;
9278
+ break;
8160
9279
  case GGML_OP_SCALE:
8161
9280
  func = ggml_cuda_scale;
8162
9281
  break;
@@ -8164,9 +9283,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8164
9283
  func = ggml_cuda_sqr;
8165
9284
  break;
8166
9285
  case GGML_OP_CLAMP:
8167
- if (!any_on_device) {
8168
- return false;
8169
- }
8170
9286
  func = ggml_cuda_clamp;
8171
9287
  break;
8172
9288
  case GGML_OP_CPY:
@@ -8175,6 +9291,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8175
9291
  case GGML_OP_CONT:
8176
9292
  func = ggml_cuda_dup;
8177
9293
  break;
9294
+ case GGML_OP_NONE:
8178
9295
  case GGML_OP_RESHAPE:
8179
9296
  case GGML_OP_VIEW:
8180
9297
  case GGML_OP_PERMUTE:
@@ -8196,6 +9313,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8196
9313
  case GGML_OP_IM2COL:
8197
9314
  func = ggml_cuda_im2col;
8198
9315
  break;
9316
+ case GGML_OP_SUM_ROWS:
9317
+ func = ggml_cuda_sum_rows;
9318
+ break;
9319
+ case GGML_OP_ARGSORT:
9320
+ func = ggml_cuda_argsort;
9321
+ break;
8199
9322
  default:
8200
9323
  return false;
8201
9324
  }
@@ -8212,7 +9335,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8212
9335
 
8213
9336
  int ggml_cuda_get_device_count() {
8214
9337
  int device_count;
8215
- CUDA_CHECK(cudaGetDeviceCount(&device_count));
9338
+ if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
9339
+ return 0;
9340
+ }
8216
9341
  return device_count;
8217
9342
  }
8218
9343
 
@@ -8228,27 +9353,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
8228
9353
 
8229
9354
  #define UNUSED GGML_UNUSED
8230
9355
 
8231
- struct ggml_backend_context_cuda {
8232
- };
8233
-
8234
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
8235
- return GGML_CUDA_NAME;
8236
-
8237
- UNUSED(backend);
8238
- }
8239
-
8240
- static void ggml_backend_cuda_free(ggml_backend_t backend) {
8241
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
8242
- delete cuda_ctx;
8243
- delete backend;
8244
- }
9356
+ // cuda buffer
8245
9357
 
8246
9358
  struct ggml_backend_buffer_context_cuda {
8247
- void * device;
8248
-
9359
+ int device;
9360
+ void * dev_ptr = nullptr;
8249
9361
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
8250
9362
  size_t temp_tensor_extra_index = 0;
8251
9363
 
9364
+ ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
9365
+
8252
9366
  ~ggml_backend_buffer_context_cuda() {
8253
9367
  delete[] temp_tensor_extras;
8254
9368
  }
@@ -8269,41 +9383,20 @@ struct ggml_backend_buffer_context_cuda {
8269
9383
 
8270
9384
  static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
8271
9385
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8272
- CUDA_CHECK(cudaFree(ctx->device));
9386
+ CUDA_CHECK(cudaFree(ctx->dev_ptr));
8273
9387
  delete ctx;
8274
9388
  }
8275
9389
 
8276
9390
  static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
8277
9391
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8278
- return ctx->device;
8279
- }
8280
-
8281
- static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8282
- int64_t row_low = 0;
8283
- int64_t row_high = ggml_nrows(tensor);
8284
- int64_t nrows_split = row_high - row_low;
8285
-
8286
- size_t size = ggml_nbytes_split(tensor, nrows_split);
8287
-
8288
- int64_t ne0 = tensor->ne[0];
8289
-
8290
- if (ggml_is_quantized(tensor->type)) {
8291
- if (ne0 % MATRIX_ROW_PADDING != 0) {
8292
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8293
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8294
- }
8295
- }
8296
-
8297
- return size;
8298
-
8299
- UNUSED(buffer);
9392
+ return ctx->dev_ptr;
8300
9393
  }
8301
9394
 
8302
9395
  static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8303
9396
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8304
9397
 
8305
9398
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
8306
- assert(tensor->view_src->buffer->backend == buffer->backend);
9399
+ assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
8307
9400
  tensor->backend = tensor->view_src->backend;
8308
9401
  tensor->extra = tensor->view_src->extra;
8309
9402
  return;
@@ -8311,7 +9404,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8311
9404
 
8312
9405
  ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
8313
9406
 
8314
- extra->data_device[g_main_device] = tensor->data;
9407
+ extra->data_device[ctx->device] = tensor->data;
8315
9408
 
8316
9409
  tensor->backend = GGML_BACKEND_GPU;
8317
9410
  tensor->extra = extra;
@@ -8323,64 +9416,207 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8323
9416
  int64_t nrows_split = row_high - row_low;
8324
9417
 
8325
9418
  size_t original_size = ggml_nbytes_split(tensor, nrows_split);
8326
- size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
9419
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
8327
9420
 
8328
9421
  if (padded_size > original_size && tensor->view_src == nullptr) {
8329
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
9422
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
8330
9423
  }
8331
9424
  }
8332
9425
 
8333
9426
  UNUSED(buffer);
8334
9427
  }
8335
9428
 
9429
+ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
9430
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
9431
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9432
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9433
+
9434
+ CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
9435
+
9436
+ UNUSED(buffer);
9437
+ }
9438
+
9439
+ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
9440
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
9441
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9442
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9443
+
9444
+ CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
9445
+
9446
+ UNUSED(buffer);
9447
+ }
9448
+
8336
9449
  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
8337
- /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
8338
- /* .get_base = */ ggml_backend_cuda_buffer_get_base,
8339
- /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
8340
- /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
8341
- /* .free_tensor = */ NULL,
+    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to   = */ NULL,
 };
 
-static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
-    ggml_cuda_set_device(g_main_device);
+// cuda buffer type
 
-    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    int device = (int) (intptr_t) buft->context;
+
+    ggml_cuda_set_device(device);
 
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
-    ggml_cuda_set_device(g_main_device);
-    CUDA_CHECK(cudaMalloc(&ctx->device, size));
+    void * dev_ptr;
+    CUDA_CHECK(cudaMalloc(&dev_ptr, size));
 
-    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
 }
 
-static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+    int64_t row_low  = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
+    static bool ggml_backend_buffer_type_cuda_initialized = false;
+    if (!ggml_backend_buffer_type_cuda_initialized) {
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_buffer_type_cuda[i] = {
+                /* .iface   = */ cuda_backend_buffer_type_interface,
+                /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            };
+        }
+        ggml_backend_buffer_type_cuda_initialized = true;
+    }
+
+    return &ggml_backend_buffer_type_cuda[device];
+}
+
+// host buffer type
+
+static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
+    delete ctx;
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr;
+    CUDA_CHECK(cudaMallocHost(&ptr, size));
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+    return buffer;
+
+    UNUSED(buft);
+}
+
+struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
+        /* .iface   = */ cuda_backend_host_buffer_type_interface,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_buffer_type_cuda_host;
+}
+
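Note: the new per-device buffer types replace the old backend-level alloc_buffer/get_alignment callbacks. Device memory is now requested through ggml_backend_cuda_buffer_type(device), and pinned (cudaMallocHost) staging memory through ggml_backend_cuda_host_buffer_type(). A minimal usage sketch, assuming the generic ggml_backend_buft_alloc_buffer() and ggml_backend_buffer_free() helpers from ggml-backend.h are available in this release; the sizes are placeholders:

    // allocate VRAM on device 0 and a pinned host staging buffer (sizes are arbitrary)
    ggml_backend_buffer_type_t buft      = ggml_backend_cuda_buffer_type(0);
    ggml_backend_buffer_type_t host_buft = ggml_backend_cuda_host_buffer_type();

    ggml_backend_buffer_t weights = ggml_backend_buft_alloc_buffer(buft,      256u*1024*1024);
    ggml_backend_buffer_t staging = ggml_backend_buft_alloc_buffer(host_buft,  32u*1024*1024);

    // ... place tensors in the buffers (e.g. via ggml-alloc) and run graphs ...

    ggml_backend_buffer_free(staging);
    ggml_backend_buffer_free(weights);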
+// backend
+
+struct ggml_backend_context_cuda {
+    int device;
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
     UNUSED(backend);
 }
 
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+}
+
 static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-
-    UNUSED(backend);
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
 
     UNUSED(backend);
 }
@@ -8394,14 +9630,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8409,7 +9645,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
 }
 
 static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_cuda_set_device(g_main_device);
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    ggml_cuda_set_main_device(cuda_ctx->device);
 
     ggml_compute_params params = {};
     params.type = GGML_TASK_COMPUTE;
@@ -8417,13 +9655,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
             continue;
-        }
+
         assert(node->backend == GGML_BACKEND_GPU);
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->extra != nullptr);
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->extra != nullptr);
             }
         }
 
@@ -8460,27 +9703,143 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     UNUSED(backend);
 }
 
+static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_REPEAT:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CONT:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ALIBI:
+        case GGML_OP_IM2COL:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_CONCAT:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_LEAKY_RELU:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(backend);
+}
+
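Note: the new supports_op callback lets a scheduler ask the CUDA backend whether it can run a node before assigning it. A minimal sketch, assuming the generic ggml_backend_supports_op() wrapper from ggml-backend.h and a cuda_backend handle obtained from ggml_backend_cuda_init(); the no-alloc context is used only to build the node:

    struct ggml_init_params ip = {
        /* .mem_size   = */ 16*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64,  8);
    struct ggml_tensor * mm = ggml_mul_mat(ctx, a, b);

    // true here, per the GGML_OP_MUL_MAT case above (both operands share ne[3])
    bool ok = ggml_backend_supports_op(cuda_backend, mm);

    ggml_free(ctx);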
 static ggml_backend_i cuda_backend_i = {
-    /* .get_name            = */ ggml_backend_cuda_name,
-    /* .free                = */ ggml_backend_cuda_free,
-    /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
-    /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
-    /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
-    /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
-    /* .synchronize         = */ ggml_backend_cuda_synchronize,
-    /* .cpy_tensor_from     = */ nullptr,
-    /* .cpy_tensor_to       = */ nullptr,
-    /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
-    /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
-    /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
-    /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
-    /* .supports_op         = */ nullptr,
+    /* .get_name                = */ ggml_backend_cuda_name,
+    /* .free                    = */ ggml_backend_cuda_free,
+    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
+    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
+    /* .cpy_tensor_from_async   = */ NULL,
+    /* .cpy_tensor_to_async     = */ NULL,
+    /* .synchronize             = */ ggml_backend_cuda_synchronize,
+    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op             = */ ggml_backend_cuda_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init() {
+ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
-    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+    if (device < 0 || device >= ggml_cuda_get_device_count()) {
+        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    // not strictly necessary, but it may reduce the overhead of the first graph_compute
+    ggml_cuda_set_main_device(device);
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+        /* .device = */ device
+    };
 
     ggml_backend_t cuda_backend = new ggml_backend {
         /* .interface = */ cuda_backend_i,
@@ -8489,3 +9848,27 @@ ggml_backend_t ggml_backend_cuda_init() {
 
     return cuda_backend;
 }
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
+    return cuda_backend;
+
+    UNUSED(params);
+}
+
+extern "C" int ggml_backend_cuda_reg_devices();
+
+int ggml_backend_cuda_reg_devices() {
+    int device_count = ggml_cuda_get_device_count();
+    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
+    for (int i = 0; i < device_count; i++) {
+        char name[128];
+        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+    }
+    return device_count;
+}
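Note: ggml_backend_cuda_init() now takes a device index, and ggml_backend_cuda_reg_devices() registers one backend per GPU ("CUDA0", "CUDA1", ...). A minimal sketch of binding a backend to a specific GPU, assuming the standard ggml_backend_name() and ggml_backend_free() helpers from ggml-backend.h; the fragment is meant to live inside a function:

    // pick GPU 1 explicitly instead of relying on the old implicit g_main_device
    ggml_backend_t cuda_backend = ggml_backend_cuda_init(1);
    if (cuda_backend == nullptr) {
        fprintf(stderr, "failed to initialize the CUDA backend for device 1\n");
        return 1;
    }

    printf("using backend: %s\n", ggml_backend_name(cuda_backend));

    // ... allocate buffers from ggml_backend_cuda_buffer_type(1) and run graphs ...

    ggml_backend_free(cuda_backend);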