llama_cpp 0.9.5 → 0.10.1

This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
@@ -1,12 +1,15 @@
  #include <algorithm>
+ #include <assert.h>
+ #include <atomic>
  #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
+ #include <float.h>
  #include <limits>
  #include <stdint.h>
  #include <stdio.h>
- #include <atomic>
- #include <assert.h>
+ #include <vector>
+

  #if defined(GGML_USE_HIPBLAS)
  #include <hip/hip_runtime.h>
@@ -69,6 +72,7 @@
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
  #define cudaSetDevice hipSetDevice
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamFireAndForget hipStreamFireAndForget
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
@@ -190,7 +194,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
  cudaGetErrorString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"CUDA error"); \
  } \
  } while (0)

@@ -204,7 +208,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #else
@@ -216,7 +220,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #endif // CUDART_VERSION >= 11
@@ -433,10 +437,9 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define WARP_SIZE 32
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

- #define CUDA_ADD_BLOCK_SIZE 256
- #define CUDA_MUL_BLOCK_SIZE 256
  #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
+ #define CUDA_TANH_BLOCK_SIZE 256
  #define CUDA_RELU_BLOCK_SIZE 256
  #define CUDA_SQR_BLOCK_SIZE 256
  #define CUDA_CPY_BLOCK_SIZE 32
@@ -449,6 +452,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
  #define CUDA_GET_ROWS_BLOCK_SIZE 256
+ #define CUDA_UPSCALE_BLOCK_SIZE 256
+ #define CUDA_CONCAT_BLOCK_SIZE 256
+ #define CUDA_PAD_BLOCK_SIZE 256
+ #define CUDA_ACC_BLOCK_SIZE 256
+ #define CUDA_IM2COL_BLOCK_SIZE 256

  // dmmv = dequantize_mul_mat_vec
  #ifndef GGML_CUDA_DMMV_X
@@ -527,40 +535,105 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
527
535
  return x;
528
536
  }
529
537
 
530
- static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
531
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
538
+ static __device__ __forceinline__ float op_repeat(const float a, const float b) {
539
+ return b;
540
+ }
532
541
 
533
- if (i >= kx) {
534
- return;
535
- }
536
- dst[i] = x[i] + y[i%ky];
542
+ static __device__ __forceinline__ float op_add(const float a, const float b) {
543
+ return a + b;
537
544
  }
538
545
 
539
- static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
540
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
546
+ static __device__ __forceinline__ float op_mul(const float a, const float b) {
547
+ return a * b;
548
+ }
541
549
 
542
- if (i >= k) {
550
+ static __device__ __forceinline__ float op_div(const float a, const float b) {
551
+ return a / b;
552
+ }
553
+
554
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
555
+ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
556
+ int ne0, int ne1, int ne2, int ne3,
557
+ int ne10, int ne11, int ne12, int ne13,
558
+ /*int s0, */ int s1, int s2, int s3,
559
+ /*int s10,*/ int s11, int s12, int s13) {
560
+ const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
561
+ const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
562
+ const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
563
+ const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
564
+
565
+ if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
543
566
  return;
544
567
  }
545
- dst[i] = __hadd(x[i], __float2half(y[i]));
568
+
569
+ const int i11 = i1 % ne11;
570
+ const int i12 = i2 % ne12;
571
+ const int i13 = i3 % ne13;
572
+
573
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
574
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
575
+ const size_t i_dst = i_src0;
576
+
577
+ const src0_t * src0_row = src0 + i_src0;
578
+ const src1_t * src1_row = src1 + i_src1;
579
+ dst_t * dst_row = dst + i_dst;
580
+
581
+ for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
582
+ const int i10 = i0 % ne10;
583
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
584
+ }
546
585
  }
547
586
 
548
- static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
587
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
588
+ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
589
+ int ne0, int ne1, int ne2, int ne3,
590
+ int ne10, int ne11, int ne12, int ne13,
591
+ /*int s0, */ int s1, int s2, int s3,
592
+ /*int s10,*/ int s11, int s12, int s13) {
593
+
549
594
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
550
595
 
551
- if (i >= k) {
596
+ const int i3 = i/(ne2*ne1*ne0);
597
+ const int i2 = (i/(ne1*ne0)) % ne2;
598
+ const int i1 = (i/ne0) % ne1;
599
+ const int i0 = i % ne0;
600
+
601
+ if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
552
602
  return;
553
603
  }
554
- dst[i] = __half2float(x[i]) + y[i];
555
- }
556
604
 
557
- static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
558
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
605
+ const int i11 = i1 % ne11;
606
+ const int i12 = i2 % ne12;
607
+ const int i13 = i3 % ne13;
608
+
609
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
610
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
611
+ const size_t i_dst = i_src0;
612
+
613
+ const src0_t * src0_row = src0 + i_src0;
614
+ const src1_t * src1_row = src1 + i_src1;
615
+ dst_t * dst_row = dst + i_dst;
616
+
617
+ const int i10 = i0 % ne10;
618
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
619
+ }
559
620
 
560
- if (i >= kx) {
621
+ static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
622
+ const int ne10, const int ne11, const int ne12,
623
+ const int nb1, const int nb2, int offset) {
624
+ const int i = blockDim.x * blockIdx.x + threadIdx.x;
625
+ if (i >= ne) {
561
626
  return;
562
627
  }
563
- dst[i] = x[i] * y[i%ky];
628
+ int src1_idx = i - offset;
629
+ int oz = src1_idx / nb2;
630
+ int oy = (src1_idx - (oz * nb2)) / nb1;
631
+ int ox = src1_idx % nb1;
632
+ if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
633
+ dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
634
+ } else {
635
+ dst[i] = x[i];
636
+ }
564
637
  }
565
638
 
566
639
  static __global__ void gelu_f32(const float * x, float * dst, const int k) {
@@ -585,6 +658,23 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] / (1.0f + expf(-x[i]));
  }

+ static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
+ const float GELU_QUICK_COEF = -1.702f;
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ if (i >= k) {
+ return;
+ }
+ dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
+ }
+
+ static __global__ void tanh_f32(const float *x, float *dst, int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ if (i >= k) {
+ return;
+ }
+ dst[i] = tanhf(x[i]);
+ }
+
  static __global__ void relu_f32(const float * x, float * dst, const int k) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -594,6 +684,14 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
  dst[i] = fmaxf(x[i], 0);
  }

+ static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ if (i >= k) {
+ return;
+ }
+ dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
+ }
+
  static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -604,12 +702,10 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  }

  template <int block_size>
- static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+ static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

- const float eps = 1e-5f;
-
  float2 mean_var = make_float2(0.f, 0.f);

  for (int col = tid; col < ncols; col += block_size) {
@@ -641,6 +737,132 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
641
737
  }
642
738
  }
643
739
 
740
+ static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
741
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
742
+ if (nidx >= ne0) {
743
+ return;
744
+ }
745
+ // operation
746
+ int offset_dst =
747
+ nidx +
748
+ blockIdx.y * ne0 +
749
+ blockIdx.z * ne0 * gridDim.y;
750
+ if (blockIdx.z < ne02) { // src0
751
+ int offset_src =
752
+ nidx +
753
+ blockIdx.y * ne0 +
754
+ blockIdx.z * ne0 * gridDim.y;
755
+ dst[offset_dst] = x[offset_src];
756
+ } else {
757
+ int offset_src =
758
+ nidx +
759
+ blockIdx.y * ne0 +
760
+ (blockIdx.z - ne02) * ne0 * gridDim.y;
761
+ dst[offset_dst] = y[offset_src];
762
+ }
763
+ }
764
+
765
+ static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
766
+ int ne0 = ne00 * scale_factor;
767
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
768
+ if (nidx >= ne0) {
769
+ return;
770
+ }
771
+ // operation
772
+ int i00 = nidx / scale_factor;
773
+ int i01 = blockIdx.y / scale_factor;
774
+ int offset_src =
775
+ i00 +
776
+ i01 * ne00 +
777
+ blockIdx.z * nb02;
778
+ int offset_dst =
779
+ nidx +
780
+ blockIdx.y * ne0 +
781
+ blockIdx.z * ne0 * gridDim.y;
782
+ dst[offset_dst] = x[offset_src];
783
+ }
784
+
785
+ static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
786
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
787
+ if (nidx >= ne0) {
788
+ return;
789
+ }
790
+
791
+ // operation
792
+ int offset_dst =
793
+ nidx +
794
+ blockIdx.y * ne0 +
795
+ blockIdx.z * ne0 * gridDim.y;
796
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
797
+ int offset_src =
798
+ nidx +
799
+ blockIdx.y * ne00 +
800
+ blockIdx.z * ne00 * ne01;
801
+ dst[offset_dst] = x[offset_src];
802
+ } else {
803
+ dst[offset_dst] = 0.0f;
804
+ }
805
+ }
806
+
807
+ template <int block_size>
808
+ static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
809
+ int start = blockIdx.x * group_size;
810
+ int end = start + group_size;
811
+
812
+ start += threadIdx.x;
813
+
814
+ if (end >= ne_elements) {
815
+ end = ne_elements;
816
+ }
817
+
818
+ float tmp = 0.0f; // partial sum for thread in warp
819
+
820
+ for (int j = start; j < end; j += block_size) {
821
+ tmp += x[j];
822
+ }
823
+
824
+ tmp = warp_reduce_sum(tmp);
825
+ if (block_size > WARP_SIZE) {
826
+ __shared__ float s_sum[32];
827
+ int warp_id = threadIdx.x / WARP_SIZE;
828
+ int lane_id = threadIdx.x % WARP_SIZE;
829
+ if (lane_id == 0) {
830
+ s_sum[warp_id] = tmp;
831
+ }
832
+ __syncthreads();
833
+ tmp = s_sum[lane_id];
834
+ tmp = warp_reduce_sum(tmp);
835
+ }
836
+
837
+ float mean = tmp / group_size;
838
+ tmp = 0.0f;
839
+
840
+ for (int j = start; j < end; j += block_size) {
841
+ float xi = x[j] - mean;
842
+ dst[j] = xi;
843
+ tmp += xi * xi;
844
+ }
845
+
846
+ tmp = warp_reduce_sum(tmp);
847
+ if (block_size > WARP_SIZE) {
848
+ __shared__ float s_sum[32];
849
+ int warp_id = threadIdx.x / WARP_SIZE;
850
+ int lane_id = threadIdx.x % WARP_SIZE;
851
+ if (lane_id == 0) {
852
+ s_sum[warp_id] = tmp;
853
+ }
854
+ __syncthreads();
855
+ tmp = s_sum[lane_id];
856
+ tmp = warp_reduce_sum(tmp);
857
+ }
858
+
859
+ float variance = tmp / group_size;
860
+ float scale = rsqrtf(variance + eps);
861
+ for (int j = start; j < end; j += block_size) {
862
+ dst[j] *= scale;
863
+ }
864
+ }
865
+
644
866
  template <int block_size>
645
867
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
646
868
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -1639,31 +1861,65 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1639
1861
  }
1640
1862
 
1641
1863
  template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1642
- static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
1643
- const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1644
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
1645
-
1646
- if (col >= ncols) {
1864
+ static __global__ void k_get_rows(
1865
+ const void * src0, const int32_t * src1, dst_t * dst,
1866
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
1867
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
1868
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
1869
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
1870
+ size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
1871
+
1872
+ const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1873
+ const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
1874
+ const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
1875
+ const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
1876
+
1877
+ if (i00 >= ne00) {
1647
1878
  return;
1648
1879
  }
1649
1880
 
1650
- const int r = y[row];
1881
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
1651
1882
 
1652
- // copy x[r*ncols + col] to dst[row*ncols + col]
1653
- const int xi = r*ncols + col;
1654
- const int di = row*ncols + col;
1883
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
1884
+ const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
1655
1885
 
1656
- const int ib = xi/qk; // block index
1657
- const int iqs = (xi%qk)/qr; // quant index
1658
- const int iybs = di - di%qk; // y block start index
1886
+ const int ib = i00/qk; // block index
1887
+ const int iqs = (i00%qk)/qr; // quant index
1888
+ const int iybs = i00 - i00%qk; // dst block start index
1659
1889
  const int y_offset = qr == 1 ? 1 : qk/2;
1660
1890
 
1661
1891
  // dequantize
1662
1892
  dfloat2 v;
1663
- dequantize_kernel(x, ib, iqs, v);
1893
+ dequantize_kernel(src0_row, ib, iqs, v);
1894
+
1895
+ dst_row[iybs + iqs + 0] = v.x;
1896
+ dst_row[iybs + iqs + y_offset] = v.y;
1897
+ }
1898
+
1899
+ template<typename src0_t, typename dst_t>
1900
+ static __global__ void k_get_rows_float(
1901
+ const src0_t * src0, const int32_t * src1, dst_t * dst,
1902
+ int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
1903
+ /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
1904
+ /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
1905
+ /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
1906
+ size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
1907
+
1908
+ const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
1909
+ const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
1910
+ const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
1911
+ const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
1664
1912
 
1665
- dst[iybs + iqs + 0] = v.x;
1666
- dst[iybs + iqs + y_offset] = v.y;
1913
+ if (i00 >= ne00) {
1914
+ return;
1915
+ }
1916
+
1917
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
1918
+
1919
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
1920
+ const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
1921
+
1922
+ dst_row[i00] = src0_row[i00];
1667
1923
  }
1668
1924
 
1669
1925
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
@@ -4559,6 +4815,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
4559
4815
  cpy_1(cx + x_offset, cdst + dst_offset);
4560
4816
  }
4561
4817
 
4818
+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
4819
+ const float * xi = (const float *) cxi;
4820
+ block_q8_0 * dsti = (block_q8_0 *) cdsti;
4821
+
4822
+ float amax = 0.0f; // absolute max
4823
+
4824
+ for (int j = 0; j < QK8_0; j++) {
4825
+ const float v = xi[j];
4826
+ amax = fmaxf(amax, fabsf(v));
4827
+ }
4828
+
4829
+ const float d = amax / ((1 << 7) - 1);
4830
+ const float id = d ? 1.0f/d : 0.0f;
4831
+
4832
+ dsti->d = d;
4833
+
4834
+ for (int j = 0; j < QK8_0; ++j) {
4835
+ const float x0 = xi[j]*id;
4836
+
4837
+ dsti->qs[j] = roundf(x0);
4838
+ }
4839
+ }
4840
+
4841
+ static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
4842
+ const float * xi = (const float *) cxi;
4843
+ block_q4_0 * dsti = (block_q4_0 *) cdsti;
4844
+
4845
+ float amax = 0.0f;
4846
+ float vmax = 0.0f;
4847
+
4848
+ for (int j = 0; j < QK4_0; ++j) {
4849
+ const float v = xi[j];
4850
+ if (amax < fabsf(v)) {
4851
+ amax = fabsf(v);
4852
+ vmax = v;
4853
+ }
4854
+ }
4855
+
4856
+ const float d = vmax / -8;
4857
+ const float id = d ? 1.0f/d : 0.0f;
4858
+
4859
+ dsti->d = d;
4860
+
4861
+ for (int j = 0; j < QK4_0/2; ++j) {
4862
+ const float x0 = xi[0 + j]*id;
4863
+ const float x1 = xi[QK4_0/2 + j]*id;
4864
+
4865
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
4866
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
4867
+
4868
+ dsti->qs[j] = xi0;
4869
+ dsti->qs[j] |= xi1 << 4;
4870
+ }
4871
+ }
4872
+
4873
+ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
4874
+ const float * xi = (const float *) cxi;
4875
+ block_q4_1 * dsti = (block_q4_1 *) cdsti;
4876
+
4877
+ float vmin = FLT_MAX;
4878
+ float vmax = -FLT_MAX;
4879
+
4880
+ for (int j = 0; j < QK4_1; ++j) {
4881
+ const float v = xi[j];
4882
+
4883
+ if (v < vmin) vmin = v;
4884
+ if (v > vmax) vmax = v;
4885
+ }
4886
+
4887
+ const float d = (vmax - vmin) / ((1 << 4) - 1);
4888
+ const float id = d ? 1.0f/d : 0.0f;
4889
+
4890
+ dsti->dm.x = d;
4891
+ dsti->dm.y = vmin;
4892
+
4893
+ for (int j = 0; j < QK4_1/2; ++j) {
4894
+ const float x0 = (xi[0 + j] - vmin)*id;
4895
+ const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
4896
+
4897
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
4898
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
4899
+
4900
+ dsti->qs[j] = xi0;
4901
+ dsti->qs[j] |= xi1 << 4;
4902
+ }
4903
+ }
4904
+
4905
+ template <cpy_kernel_t cpy_blck, int qk>
4906
+ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
4907
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
4908
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
4909
+ const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
4910
+
4911
+ if (i >= ne) {
4912
+ return;
4913
+ }
4914
+
4915
+ const int i02 = i / (ne00*ne01);
4916
+ const int i01 = (i - i02*ne01*ne00) / ne00;
4917
+ const int i00 = (i - i02*ne01*ne00 - i01*ne00);
4918
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
4919
+
4920
+ const int i12 = i / (ne10*ne11);
4921
+ const int i11 = (i - i12*ne10*ne11) / ne10;
4922
+ const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
4923
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
4924
+
4925
+ cpy_blck(cx + x_offset, cdst + dst_offset);
4926
+ }
4927
+
4562
4928
  static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
4563
4929
  const float y = (i0 / 2 - low) / max(0.001f, high - low);
4564
4930
  return 1.0f - min(1.0f, max(0.0f, y));
@@ -4713,6 +5079,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
4713
5079
  dst[i] = col * m_k + x[i];
4714
5080
  }
4715
5081
 
5082
+ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
5083
+ const int row = blockIdx.y;
5084
+ const int col = threadIdx.x;
5085
+
5086
+ float sum = 0.0f;
5087
+ for (int i = col; i < ncols; i += blockDim.x) {
5088
+ sum += x[row * ncols + i];
5089
+ }
5090
+
5091
+ sum = warp_reduce_sum(sum);
5092
+
5093
+ if (col == 0) {
5094
+ dst[row] = sum;
5095
+ }
5096
+ }
5097
+
5098
+ template<typename T>
5099
+ static inline __device__ void swap(T & a, T & b) {
5100
+ T tmp = a;
5101
+ a = b;
5102
+ b = tmp;
5103
+ }
5104
+
5105
+ template<ggml_sort_order order>
5106
+ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
5107
+ // bitonic sort
5108
+ int col = threadIdx.x;
5109
+ int row = blockIdx.y;
5110
+
5111
+ if (col >= ncols) return;
5112
+
5113
+ const float * x_row = x + row * ncols;
5114
+ int * dst_row = dst + row * ncols;
5115
+
5116
+ // initialize indices
5117
+ if (col < ncols) {
5118
+ dst_row[col] = col;
5119
+ }
5120
+ __syncthreads();
5121
+
5122
+ for (int k = 2; k <= ncols; k *= 2) {
5123
+ for (int j = k / 2; j > 0; j /= 2) {
5124
+ int ixj = col ^ j;
5125
+ if (ixj > col) {
5126
+ if ((col & k) == 0) {
5127
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
5128
+ swap(dst_row[col], dst_row[ixj]);
5129
+ }
5130
+ } else {
5131
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
5132
+ swap(dst_row[col], dst_row[ixj]);
5133
+ }
5134
+ }
5135
+ }
5136
+ __syncthreads();
5137
+ }
5138
+ }
5139
+ }
5140
+
4716
5141
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
4717
5142
  const int col = blockDim.y*blockIdx.y + threadIdx.y;
4718
5143
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
@@ -4722,8 +5147,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  }

  const int i = row*ncols + col;
- // dst[i] = col > n_past + row ? -INFINITY : x[i];
- dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+ //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
  }

  static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
@@ -4820,49 +5246,220 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
4820
5246
 
4821
5247
  static __global__ void im2col_f32_f16(
4822
5248
  const float * x, half * dst,
4823
- int ofs0, int ofs1, int IW, int IH, int CHW,
5249
+ int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
4824
5250
  int s0, int s1, int p0, int p1, int d0, int d1) {
4825
- const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
4826
- const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
5251
+ const int i = threadIdx.x + blockIdx.x * blockDim.x;
5252
+ if (i >= pelements) {
5253
+ return;
5254
+ }
5255
+
5256
+ const int ksize = OW * (KH > 1 ? KW : 1);
5257
+ const int kx = i / ksize;
5258
+ const int kd = kx * ksize;
5259
+ const int ky = (i - kd) / OW;
5260
+ const int ix = i % OW;
5261
+
5262
+ const int iiw = ix * s0 + kx * d0 - p0;
5263
+ const int iih = blockIdx.y * s1 + ky * d1 - p1;
4827
5264
 
4828
5265
  const int offset_dst =
4829
- (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
4830
- (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
5266
+ (blockIdx.y * OW + ix) * CHW +
5267
+ (blockIdx.z * (KW * KH) + ky * KW + kx);
4831
5268
 
4832
5269
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
4833
5270
  dst[offset_dst] = __float2half(0.0f);
4834
5271
  } else {
4835
- const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
5272
+ const int offset_src = blockIdx.z * offset_delta;
4836
5273
  dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
4837
5274
  }
4838
5275
  }
4839
5276
 
4840
5277
  template<int qk, int qr, dequantize_kernel_t dq>
4841
- static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
5278
+ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5279
+ const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
5280
+
5281
+ GGML_TENSOR_BINARY_OP_LOCALS
5282
+
4842
5283
  const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
4843
- const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
4844
- const dim3 block_nums(block_num_x, nrows, 1);
4845
- k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
4846
- }
5284
+ const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
5285
+ const dim3 block_nums(block_num_x, ne10, ne11*ne12);
5286
+
5287
+ // strides in elements
5288
+ //const size_t s0 = nb0 / ggml_element_size(dst);
5289
+ const size_t s1 = nb1 / ggml_element_size(dst);
5290
+ const size_t s2 = nb2 / ggml_element_size(dst);
5291
+ const size_t s3 = nb3 / ggml_element_size(dst);
5292
+
5293
+ const size_t s10 = nb10 / ggml_element_size(src1);
5294
+ const size_t s11 = nb11 / ggml_element_size(src1);
5295
+ const size_t s12 = nb12 / ggml_element_size(src1);
5296
+ //const size_t s13 = nb13 / ggml_element_size(src1);
5297
+
5298
+ GGML_ASSERT(ne00 % 2 == 0);
5299
+
5300
+ k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
5301
+ src0_dd, src1_dd, dst_dd,
5302
+ ne00, /*ne01, ne02, ne03,*/
5303
+ /*ne10, ne11,*/ ne12, /*ne13,*/
5304
+ /* s0,*/ s1, s2, s3,
5305
+ /* nb00,*/ nb01, nb02, nb03,
5306
+ s10, s11, s12/*, s13*/);
4847
5307
 
4848
- static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4849
- const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4850
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
5308
+ (void) dst;
4851
5309
  }
4852
5310
 
4853
- static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
4854
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4855
- add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
4856
- }
5311
+ template<typename src0_t>
5312
+ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5313
+ const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
5314
+
5315
+ GGML_TENSOR_BINARY_OP_LOCALS
4857
5316
 
4858
- static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
4859
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4860
- add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
5317
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
5318
+ const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
5319
+ const dim3 block_nums(block_num_x, ne10, ne11*ne12);
5320
+
5321
+ // strides in elements
5322
+ //const size_t s0 = nb0 / ggml_element_size(dst);
5323
+ const size_t s1 = nb1 / ggml_element_size(dst);
5324
+ const size_t s2 = nb2 / ggml_element_size(dst);
5325
+ const size_t s3 = nb3 / ggml_element_size(dst);
5326
+
5327
+ const size_t s10 = nb10 / ggml_element_size(src1);
5328
+ const size_t s11 = nb11 / ggml_element_size(src1);
5329
+ const size_t s12 = nb12 / ggml_element_size(src1);
5330
+ //const size_t s13 = nb13 / ggml_element_size(src1);
5331
+
5332
+ k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
5333
+ src0_dd, src1_dd, dst_dd,
5334
+ ne00, /*ne01, ne02, ne03,*/
5335
+ /*ne10, ne11,*/ ne12, /*ne13,*/
5336
+ /* s0,*/ s1, s2, s3,
5337
+ /* nb00,*/ nb01, nb02, nb03,
5338
+ s10, s11, s12/*, s13*/);
5339
+
5340
+ (void) dst;
4861
5341
  }
4862
5342
 
4863
- static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4864
- const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
4865
- mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
5343
+ template<float (*bin_op)(const float, const float)>
5344
+ struct bin_bcast_cuda {
5345
+ template<typename src0_t, typename src1_t, typename dst_t>
5346
+ void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
5347
+ const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
5348
+ cudaStream_t stream) {
5349
+
5350
+ GGML_TENSOR_BINARY_OP_LOCALS
5351
+
5352
+ int nr0 = ne10/ne0;
5353
+ int nr1 = ne11/ne1;
5354
+ int nr2 = ne12/ne2;
5355
+ int nr3 = ne13/ne3;
5356
+
5357
+ int nr[4] = { nr0, nr1, nr2, nr3 };
5358
+
5359
+ // collapse dimensions until first broadcast dimension
5360
+ int64_t cne0[] = {ne0, ne1, ne2, ne3};
5361
+ int64_t cne1[] = {ne10, ne11, ne12, ne13};
5362
+ size_t cnb0[] = {nb0, nb1, nb2, nb3};
5363
+ size_t cnb1[] = {nb10, nb11, nb12, nb13};
5364
+ auto collapse = [](int64_t cne[]) {
5365
+ cne[0] *= cne[1];
5366
+ cne[1] = cne[2];
5367
+ cne[2] = cne[3];
5368
+ cne[3] = 1;
5369
+ };
5370
+
5371
+ auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
5372
+ cnb[1] *= cne[1];
5373
+ cnb[2] *= cne[2];
5374
+ cnb[3] *= cne[3];
5375
+ };
5376
+
5377
+ for (int i = 0; i < 4; i++) {
5378
+ if (nr[i] != 1) {
5379
+ break;
5380
+ }
5381
+ if (i > 0) {
5382
+ collapse_nb(cnb0, cne0);
5383
+ collapse_nb(cnb1, cne1);
5384
+ collapse(cne0);
5385
+ collapse(cne1);
5386
+ }
5387
+ }
5388
+ {
5389
+ int64_t ne0 = cne0[0];
5390
+ int64_t ne1 = cne0[1];
5391
+ int64_t ne2 = cne0[2];
5392
+ int64_t ne3 = cne0[3];
5393
+
5394
+ int64_t ne10 = cne1[0];
5395
+ int64_t ne11 = cne1[1];
5396
+ int64_t ne12 = cne1[2];
5397
+ int64_t ne13 = cne1[3];
5398
+
5399
+ size_t nb0 = cnb0[0];
5400
+ size_t nb1 = cnb0[1];
5401
+ size_t nb2 = cnb0[2];
5402
+ size_t nb3 = cnb0[3];
5403
+
5404
+ size_t nb10 = cnb1[0];
5405
+ size_t nb11 = cnb1[1];
5406
+ size_t nb12 = cnb1[2];
5407
+ size_t nb13 = cnb1[3];
5408
+
5409
+ size_t s0 = nb0 / sizeof(dst_t);
5410
+ size_t s1 = nb1 / sizeof(dst_t);
5411
+ size_t s2 = nb2 / sizeof(dst_t);
5412
+ size_t s3 = nb3 / sizeof(dst_t);
5413
+
5414
+ size_t s10 = nb10 / sizeof(src1_t);
5415
+ size_t s11 = nb11 / sizeof(src1_t);
5416
+ size_t s12 = nb12 / sizeof(src1_t);
5417
+ size_t s13 = nb13 / sizeof(src1_t);
5418
+
5419
+ GGML_ASSERT(s0 == 1);
5420
+ GGML_ASSERT(s10 == 1);
5421
+
5422
+ const int block_size = 128;
5423
+
5424
+ int64_t hne0 = std::max(ne0/2LL, 1LL);
5425
+
5426
+ dim3 block_dims;
5427
+ block_dims.x = std::min<unsigned int>(hne0, block_size);
5428
+ block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
5429
+ block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
5430
+
5431
+ dim3 block_nums(
5432
+ (hne0 + block_dims.x - 1) / block_dims.x,
5433
+ (ne1 + block_dims.y - 1) / block_dims.y,
5434
+ (ne2*ne3 + block_dims.z - 1) / block_dims.z
5435
+ );
5436
+
5437
+ if (block_nums.z > 65535) {
5438
+ // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
5439
+ int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
5440
+ k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
5441
+ src0_dd, src1_dd, dst_dd,
5442
+ ne0, ne1, ne2, ne3,
5443
+ ne10, ne11, ne12, ne13,
5444
+ /* s0, */ s1, s2, s3,
5445
+ /* s10, */ s11, s12, s13);
5446
+ } else {
5447
+ k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
5448
+ src0_dd, src1_dd, dst_dd,
5449
+ ne0, ne1, ne2, ne3,
5450
+ ne10, ne11, ne12, ne13,
5451
+ /* s0, */ s1, s2, s3,
5452
+ /* s10, */ s11, s12, s13);
5453
+ }
5454
+ }
5455
+ }
5456
+ };
5457
+
5458
+ static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
5459
+ const int ne10, const int ne11, const int ne12,
5460
+ const int nb1, const int nb2, const int offset, cudaStream_t stream) {
5461
+ int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
5462
+ acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
4866
5463
  }
4867
5464
 
4868
5465
  static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
@@ -4875,27 +5472,74 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
4875
5472
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
4876
5473
  }
4877
5474
 
5475
+ static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
5476
+ const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
5477
+ gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
5478
+ }
5479
+
5480
+ static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
5481
+ const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
5482
+ tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
5483
+ }
5484
+
4878
5485
  static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
4879
5486
  const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
4880
5487
  relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
4881
5488
  }
4882
5489
 
5490
+ static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
5491
+ const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
5492
+ leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
5493
+ }
5494
+
4883
5495
  static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
4884
5496
  const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
4885
5497
  sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
4886
5498
  }
4887
5499
 
4888
- static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5500
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
4889
5501
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4890
5502
  if (ncols < 1024) {
4891
5503
  const dim3 block_dims(WARP_SIZE, 1, 1);
4892
- norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
5504
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
5505
+ } else {
5506
+ const dim3 block_dims(1024, 1, 1);
5507
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
5508
+ }
5509
+ }
5510
+
5511
+ static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
5512
+ static const float eps = 1e-6f;
5513
+ if (group_size < 1024) {
5514
+ const dim3 block_dims(WARP_SIZE, 1, 1);
5515
+ group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
4893
5516
  } else {
4894
5517
  const dim3 block_dims(1024, 1, 1);
4895
- norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
5518
+ group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
4896
5519
  }
4897
5520
  }
4898
5521
 
5522
+ static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
5523
+ int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
5524
+ dim3 gridDim(num_blocks, ne1, ne2);
5525
+ concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
5526
+ }
5527
+
5528
+ static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
5529
+ int ne0 = (ne00 * scale_factor);
5530
+ int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
5531
+ dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
5532
+ upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
5533
+ }
5534
+
5535
+ static void pad_f32_cuda(const float * x, float * dst,
5536
+ const int ne00, const int ne01, const int ne02,
5537
+ const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
5538
+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
5539
+ dim3 gridDim(num_blocks, ne1, ne2);
5540
+ pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
5541
+ }
5542
+
4899
5543
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
4900
5544
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4901
5545
  if (ncols < 1024) {
@@ -4914,34 +5558,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
4914
5558
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
4915
5559
  }
4916
5560
 
4917
- template<typename dst_t>
4918
- static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4919
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4920
- dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4921
- }
4922
-
4923
- template<typename dst_t>
4924
- static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4925
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4926
- dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4927
- }
4928
-
4929
- template<typename dst_t>
4930
- static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4931
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4932
- dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4933
- }
4934
-
4935
- template<typename dst_t>
4936
- static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
4937
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4938
- dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4939
- }
4940
-
4941
- template<typename dst_t>
4942
- static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
5561
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
5562
+ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
4943
5563
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
4944
- dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
5564
+ dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
4945
5565
  }
4946
5566
 
4947
5567
  template<typename dst_t>
@@ -4990,6 +5610,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
4990
5610
  #endif
4991
5611
  }
4992
5612
 
5613
+ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
5614
+ switch (type) {
5615
+ case GGML_TYPE_Q4_0:
5616
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
5617
+ case GGML_TYPE_Q4_1:
5618
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
5619
+ case GGML_TYPE_Q5_0:
5620
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
5621
+ case GGML_TYPE_Q5_1:
5622
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
5623
+ case GGML_TYPE_Q8_0:
5624
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
5625
+ case GGML_TYPE_Q2_K:
5626
+ return dequantize_row_q2_K_cuda;
5627
+ case GGML_TYPE_Q3_K:
5628
+ return dequantize_row_q3_K_cuda;
5629
+ case GGML_TYPE_Q4_K:
5630
+ return dequantize_row_q4_K_cuda;
5631
+ case GGML_TYPE_Q5_K:
5632
+ return dequantize_row_q5_K_cuda;
5633
+ case GGML_TYPE_Q6_K:
5634
+ return dequantize_row_q6_K_cuda;
5635
+ case GGML_TYPE_F32:
5636
+ return dequantize_block_cuda<1, 1, convert_f32>;
5637
+ default:
5638
+ return nullptr;
5639
+ }
5640
+ }
5641
+
5642
+ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
5643
+ switch (type) {
5644
+ case GGML_TYPE_Q4_0:
5645
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
5646
+ case GGML_TYPE_Q4_1:
5647
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
5648
+ case GGML_TYPE_Q5_0:
5649
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
5650
+ case GGML_TYPE_Q5_1:
5651
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
5652
+ case GGML_TYPE_Q8_0:
5653
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
5654
+ case GGML_TYPE_Q2_K:
5655
+ return dequantize_row_q2_K_cuda;
5656
+ case GGML_TYPE_Q3_K:
5657
+ return dequantize_row_q3_K_cuda;
5658
+ case GGML_TYPE_Q4_K:
5659
+ return dequantize_row_q4_K_cuda;
5660
+ case GGML_TYPE_Q5_K:
5661
+ return dequantize_row_q5_K_cuda;
5662
+ case GGML_TYPE_Q6_K:
5663
+ return dequantize_row_q6_K_cuda;
5664
+ case GGML_TYPE_F16:
5665
+ return dequantize_block_cuda<1, 1, convert_f16>;
5666
+ default:
5667
+ return nullptr;
5668
+ }
5669
+ }
5670
+
4993
5671
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4994
5672
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4995
5673
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5078,6 +5756,15 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(block_num_y, 1, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ dequantize_mul_mat_vec<1, 1, convert_f16>
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK4_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5168,83 +5855,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
5168
5855
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
5169
5856
  }
5170
5857
 
5171
- static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
5172
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
5173
- dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
5174
- }
5175
-
5176
- static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
5177
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
5178
- dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
5179
- }
5180
-
5181
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5182
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
5183
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
5184
- const dim3 block_nums(block_num_y, 1, 1);
5185
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
5186
- dequantize_mul_mat_vec<1, 1, convert_f16>
5187
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
5188
- }
5189
-
5190
- static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
5191
- switch (type) {
5192
- case GGML_TYPE_Q4_0:
5193
- return dequantize_row_q4_0_cuda;
5194
- case GGML_TYPE_Q4_1:
5195
- return dequantize_row_q4_1_cuda;
5196
- case GGML_TYPE_Q5_0:
5197
- return dequantize_row_q5_0_cuda;
5198
- case GGML_TYPE_Q5_1:
5199
- return dequantize_row_q5_1_cuda;
5200
- case GGML_TYPE_Q8_0:
5201
- return dequantize_row_q8_0_cuda;
5202
- case GGML_TYPE_Q2_K:
5203
- return dequantize_row_q2_K_cuda;
5204
- case GGML_TYPE_Q3_K:
5205
- return dequantize_row_q3_K_cuda;
5206
- case GGML_TYPE_Q4_K:
5207
- return dequantize_row_q4_K_cuda;
5208
- case GGML_TYPE_Q5_K:
5209
- return dequantize_row_q5_K_cuda;
5210
- case GGML_TYPE_Q6_K:
5211
- return dequantize_row_q6_K_cuda;
5212
- case GGML_TYPE_F32:
5213
- return convert_fp32_to_fp16_cuda;
5214
- default:
5215
- return nullptr;
5216
- }
5217
- }
5218
-
5219
- static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
5220
- switch (type) {
5221
- case GGML_TYPE_Q4_0:
5222
- return dequantize_row_q4_0_cuda;
5223
- case GGML_TYPE_Q4_1:
5224
- return dequantize_row_q4_1_cuda;
5225
- case GGML_TYPE_Q5_0:
5226
- return dequantize_row_q5_0_cuda;
5227
- case GGML_TYPE_Q5_1:
5228
- return dequantize_row_q5_1_cuda;
5229
- case GGML_TYPE_Q8_0:
5230
- return dequantize_row_q8_0_cuda;
5231
- case GGML_TYPE_Q2_K:
5232
- return dequantize_row_q2_K_cuda;
5233
- case GGML_TYPE_Q3_K:
5234
- return dequantize_row_q3_K_cuda;
5235
- case GGML_TYPE_Q4_K:
5236
- return dequantize_row_q4_K_cuda;
5237
- case GGML_TYPE_Q5_K:
5238
- return dequantize_row_q5_K_cuda;
5239
- case GGML_TYPE_Q6_K:
5240
- return dequantize_row_q6_K_cuda;
5241
- case GGML_TYPE_F16:
5242
- return convert_fp16_to_fp32_cuda;
5243
- default:
5244
- return nullptr;
5245
- }
5246
- }
5247
-
5248
5858
  static void ggml_mul_mat_q4_0_q8_1_cuda(
5249
5859
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
5250
5860
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -5737,6 +6347,39 @@ static void ggml_cpy_f32_f16_cuda(
5737
6347
  (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
5738
6348
  }
5739
6349
 
6350
+ static void ggml_cpy_f32_q8_0_cuda(
6351
+ const char * cx, char * cdst, const int ne,
6352
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
6353
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
6354
+
6355
+ GGML_ASSERT(ne % QK8_0 == 0);
6356
+ const int num_blocks = ne / QK8_0;
6357
+ cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
6358
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
6359
+ }
6360
+
6361
+ static void ggml_cpy_f32_q4_0_cuda(
6362
+ const char * cx, char * cdst, const int ne,
6363
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
6364
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
6365
+
6366
+ GGML_ASSERT(ne % QK4_0 == 0);
6367
+ const int num_blocks = ne / QK4_0;
6368
+ cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
6369
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
6370
+ }
6371
+
6372
+ static void ggml_cpy_f32_q4_1_cuda(
6373
+ const char * cx, char * cdst, const int ne,
6374
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
6375
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
6376
+
6377
+ GGML_ASSERT(ne % QK4_1 == 0);
6378
+ const int num_blocks = ne / QK4_1;
6379
+ cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
6380
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
6381
+ }
6382
+
5740
6383
  static void ggml_cpy_f16_f16_cuda(
5741
6384
  const char * cx, char * cdst, const int ne,
5742
6385
  const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -5823,6 +6466,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
  alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
  }

+ static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ const dim3 block_nums(1, nrows, 1);
+ k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+ }
+
+ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+ // bitonic sort requires ncols to be power of 2
+ GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+ const dim3 block_dims(ncols, 1, 1);
+ const dim3 block_nums(1, nrows, 1);
+ if (order == GGML_SORT_ASC) {
+ k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+ } else if (order == GGML_SORT_DESC) {
+ k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+ } else {
+ GGML_ASSERT(false);
+ }
+ }
+
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
  const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -5838,13 +6502,14 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
  }

- static void im2col_f32_f16_cuda(const float * x, half * dst,
- int OH, int IW, int IH, int OW, int IC,
- int KH, int KW, int N, int ofs0, int ofs1,
- int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
- dim3 block_nums(IC, OH, OW);
- dim3 block_dims(N, KH, KW);
- im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+ static void im2col_f32_f16_cuda(const float* x, half* dst,
+ int IW, int IH, int OW, int OH, int KW, int KH, int IC,
+ int offset_delta,
+ int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+ const int parallel_elements = OW * KW * KH;
+ const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+ dim3 block_nums(num_blocks, OH, IC);
+ im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
  }

  // buffer pool for cuda
@@ -5915,7 +6580,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
  return ptr;
  }
  #ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
  (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
  #endif
  void * ptr;
@@ -6053,7 +6718,7 @@ void * ggml_cuda_host_malloc(size_t size) {
  // The allocation error can be bypassed. A null ptr will assigned out of this function.
  // This can fixed the OOM error in WSL.
  cudaGetLastError();
- fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
+ fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
  size/1024.0/1024.0, cudaGetErrorString(err));
  return nullptr;
  }
@@ -6098,75 +6763,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
6098
6763
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
6099
6764
  if (nb0 == ts && nb1 == ts*ne0/bs) {
6100
6765
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
6101
- }
6102
- if (nb0 == ts) {
6766
+ } else if (nb0 == ts) {
6103
6767
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
6104
- }
6105
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
6106
- const void * rx = (const void *) ((const char *) x + i1*nb1);
6107
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
6108
- // pretend the row is a matrix with cols=1
6109
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
6110
- if (r != cudaSuccess) { return r; }
6111
- }
6112
- return cudaSuccess;
6113
- }
6114
-
6115
- static void ggml_cuda_op_repeat(
6116
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6117
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
6118
- // guaranteed to be an integer due to the check in ggml_can_repeat
6119
- const int64_t ne0 = dst->ne[0];
6120
- const int64_t ne1 = dst->ne[1];
6121
- const int64_t ne2 = dst->ne[2];
6122
- const int64_t ne3 = dst->ne[3];
6123
-
6124
- const int64_t ne00 = src0->ne[0];
6125
- const int64_t ne01 = src0->ne[1];
6126
- const int64_t ne02 = src0->ne[2];
6127
- const int64_t ne03 = src0->ne[3];
6128
-
6129
- const size_t nb0 = dst->nb[0];
6130
- const size_t nb1 = dst->nb[1];
6131
- const size_t nb2 = dst->nb[2];
6132
- const size_t nb3 = dst->nb[3];
6133
-
6134
- const size_t nb00 = src0->nb[0];
6135
- const size_t nb01 = src0->nb[1];
6136
- const size_t nb02 = src0->nb[2];
6137
- const size_t nb03 = src0->nb[3];
6138
-
6139
- const int nr0 = (int)(ne0/ne00);
6140
- const int nr1 = (int)(ne1/ne01);
6141
- const int nr2 = (int)(ne2/ne02);
6142
- const int nr3 = (int)(ne3/ne03);
6143
-
6144
- // TODO: support for transposed / permuted tensors
6145
- GGML_ASSERT(nb0 == sizeof(float));
6146
- GGML_ASSERT(nb00 == sizeof(float));
6147
-
6148
- // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
6149
- for (int i3 = 0; i3 < nr3; i3++) {
6150
- for (int k3 = 0; k3 < ne03; k3++) {
6151
- for (int i2 = 0; i2 < nr2; i2++) {
6152
- for (int k2 = 0; k2 < ne02; k2++) {
6153
- for (int i1 = 0; i1 < nr1; i1++) {
6154
- for (int k1 = 0; k1 < ne01; k1++) {
6155
- for (int i0 = 0; i0 < nr0; i0++) {
6156
- CUDA_CHECK(cudaMemcpyAsync(
6157
- (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
6158
- (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
6159
- ne00*nb0, cudaMemcpyDeviceToDevice, stream));
6160
- }
6161
- }
6162
- }
6163
- }
6164
- }
6768
+ } else {
6769
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
6770
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
6771
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
6772
+ // pretend the row is a matrix with cols=1
6773
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
6774
+ if (r != cudaSuccess) return r;
6165
6775
  }
6776
+ return cudaSuccess;
6166
6777
  }
6167
-
6168
- (void) src1;
6169
- (void) src1_d;
6170
6778
  }
6171
6779
 
6172
6780
  static void ggml_cuda_op_get_rows(
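The restructured ggml_cuda_cpy_tensor_2d above now picks one of three copy strategies in a single if/else chain: a single contiguous copy when both byte strides are packed, one 2D strided copy when only the element stride is packed, and otherwise a per-row loop that treats each row as a cols=1 matrix. A CPU-only sketch of the same selection logic, with memcpy standing in for the asynchronous CUDA copies; the real function additionally accounts for quantized block sizes:

    #include <cstdint>
    #include <cstring>

    // dst is packed; src rows have element stride nb0 and row stride nb1 (both in bytes).
    // es = bytes per element, ne0 = elements per row, nrows = rows to copy.
    static void copy_rows_ref(char * dst, const char * src, int64_t nrows, int64_t ne0,
                              size_t es, size_t nb0, size_t nb1) {
        const size_t row_bytes = es*ne0;
        if (nb0 == es && nb1 == row_bytes) {
            memcpy(dst, src, nrows*row_bytes);                       // fully contiguous: one copy
        } else if (nb0 == es) {
            for (int64_t i1 = 0; i1 < nrows; i1++) {                 // packed rows with a larger row pitch
                memcpy(dst + i1*row_bytes, src + i1*nb1, row_bytes);
            }
        } else {
            for (int64_t i1 = 0; i1 < nrows; i1++) {                 // strided elements: element by element
                for (int64_t i0 = 0; i0 < ne0; i0++) {
                    memcpy(dst + i1*row_bytes + i0*es, src + i1*nb1 + i0*nb0, es);
                }
            }
        }
    }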
@@ -6175,36 +6783,34 @@ static void ggml_cuda_op_get_rows(
6175
6783
 
6176
6784
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
6177
6785
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
6178
- GGML_ASSERT(ggml_is_contiguous(src0));
6179
- GGML_ASSERT(ggml_is_contiguous(src1));
6180
- GGML_ASSERT(ggml_is_contiguous(dst));
6181
6786
 
6182
- const int ncols = src0->ne[0];
6183
- const int nrows = ggml_nelements(src1);
6787
+ GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
6788
+ GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
6789
+ GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
6184
6790
 
6185
6791
  const int32_t * src1_i32 = (const int32_t *) src1_d;
6186
6792
 
6187
6793
  switch (src0->type) {
6188
6794
  case GGML_TYPE_F16:
6189
- get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6795
+ get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
6190
6796
  break;
6191
6797
  case GGML_TYPE_F32:
6192
- get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6798
+ get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6193
6799
  break;
6194
6800
  case GGML_TYPE_Q4_0:
6195
- get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6801
+ get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6196
6802
  break;
6197
6803
  case GGML_TYPE_Q4_1:
6198
- get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6804
+ get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6199
6805
  break;
6200
6806
  case GGML_TYPE_Q5_0:
6201
- get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6807
+ get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6202
6808
  break;
6203
6809
  case GGML_TYPE_Q5_1:
6204
- get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6810
+ get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6205
6811
  break;
6206
6812
  case GGML_TYPE_Q8_0:
6207
- get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
6813
+ get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
6208
6814
  break;
6209
6815
  default:
6210
6816
  // TODO: k-quants
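The hunk above relaxes get_rows from requiring fully contiguous tensors to only requiring a packed innermost dimension (nb[0] equal to the type size), and passes the tensors themselves so the kernels can honor the remaining strides. A small CPU reference of a stride-aware row gather for the f32 case; names are illustrative and this is not the CUDA kernel:

    #include <cstdint>
    #include <cstring>

    // Gather rows of an f32 matrix via int32 indices, honoring byte strides:
    // nb01 = source row stride, nb1 = destination row stride.
    static void get_rows_f32_ref(const float * src, const int32_t * ids, float * dst,
                                 int64_t nrows, int64_t ncols, size_t nb01, size_t nb1) {
        for (int64_t r = 0; r < nrows; r++) {
            const float * src_row = (const float *)((const char *) src + ids[r]*nb01);
            float       * dst_row = (float       *)((char       *) dst + r*nb1);
            memcpy(dst_row, src_row, ncols*sizeof(float));
        }
    }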
@@ -6213,46 +6819,76 @@ static void ggml_cuda_op_get_rows(
6213
6819
  }
6214
6820
  }
6215
6821
 
6216
- inline void ggml_cuda_op_add(
6822
+ template<class op>
6823
+ inline void ggml_cuda_op_bin_bcast(
6217
6824
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6218
6825
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6219
6826
 
6220
6827
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6221
6828
 
6222
- const int64_t ne10 = src1->ne[0];
6223
- const int64_t ne11 = src1->ne[1];
6224
-
6225
6829
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
6226
- add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
6830
+ op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6227
6831
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
6228
- add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
6832
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
6229
6833
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
6230
- add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
6834
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
6231
6835
  } else {
6232
- fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
6836
+ fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
6837
+ ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
6233
6838
  GGML_ASSERT(false);
6234
6839
  }
6840
+ }
6841
+
6842
+ static void ggml_cuda_op_repeat(
6843
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6844
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
6845
+
6846
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
6235
6847
 
6236
6848
  (void) src1;
6237
- (void) dst;
6849
+ (void) src1_d;
6238
6850
  }
6239
6851
 
6240
- inline void ggml_cuda_op_mul(
6852
+ inline void ggml_cuda_op_add(
6853
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6854
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6855
+
6856
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6857
+ }
6858
+
6859
+ inline void ggml_cuda_op_acc(
6241
6860
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6242
6861
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6243
6862
 
6244
6863
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6245
6864
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6246
6865
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6866
+ GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
6247
6867
 
6248
- const int64_t ne10 = src1->ne[0];
6249
- const int64_t ne11 = src1->ne[1];
6868
+ int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
6869
+ int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
6870
+ // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
6871
+ int offset = dst->op_params[3] / 4; // offset in bytes
6250
6872
 
6251
- mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
6873
+ acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
6252
6874
 
6253
6875
  (void) dst;
6254
6876
  }
6255
6877
 
6878
+ inline void ggml_cuda_op_mul(
6879
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6880
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6881
+
6882
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6883
+ }
6884
+
6885
+ inline void ggml_cuda_op_div(
6886
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6887
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6888
+
6889
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6890
+ }
6891
+
6256
6892
  inline void ggml_cuda_op_gelu(
6257
6893
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6258
6894
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
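In the hunk above, the dedicated add/mul launchers are replaced by a single templated ggml_cuda_op_bin_bcast that instantiates a broadcasting kernel wrapper per operator, so add, mul, div and repeat share one code path (repeat is expressed as a broadcast with the roles of dst and src0 swapped). The compile-time functor dispatch pattern, as a plain C++ sketch without the broadcasting and f16 handling (names are illustrative):

    struct op_add_ref { float operator()(float a, float b) const { return a + b; } };
    struct op_mul_ref { float operator()(float a, float b) const { return a * b; } };
    struct op_div_ref { float operator()(float a, float b) const { return a / b; } };

    template <class op>
    static void bin_op_ref(const float * a, const float * b, float * c, int n) {
        for (int i = 0; i < n; i++) {
            c[i] = op()(a[i], b[i]);   // operator chosen at compile time, no per-element branch
        }
    }

    // usage: bin_op_ref<op_add_ref>(a, b, c, n);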
@@ -6281,6 +6917,34 @@ inline void ggml_cuda_op_silu(
6281
6917
  (void) src1_dd;
6282
6918
  }
6283
6919
 
6920
+ inline void ggml_cuda_op_gelu_quick(
6921
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6922
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6923
+
6924
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6925
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6926
+
6927
+ gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6928
+
6929
+ (void) src1;
6930
+ (void) dst;
6931
+ (void) src1_dd;
6932
+ }
6933
+
6934
+ inline void ggml_cuda_op_tanh(
6935
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6936
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6937
+
6938
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6939
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6940
+
6941
+ tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6942
+
6943
+ (void) src1;
6944
+ (void) dst;
6945
+ (void) src1_dd;
6946
+ }
6947
+
6284
6948
  inline void ggml_cuda_op_relu(
6285
6949
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6286
6950
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
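gelu_quick and tanh are wired up above as simple element-wise f32 ops. For orientation, CPU reference versions of the two activations; the 1.702 sigmoid coefficient commonly used for the quick-GELU approximation is an assumption here, not something read from this diff:

    #include <cmath>

    static float gelu_quick_ref(float x) {
        return x / (1.0f + expf(-1.702f * x));   // x * sigmoid(1.702 * x)
    }

    static float tanh_ref(float x) {
        return tanhf(x);
    }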
@@ -6295,38 +6959,38 @@ inline void ggml_cuda_op_relu(
6295
6959
  (void) src1_dd;
6296
6960
  }
6297
6961
 
6298
- inline void ggml_cuda_op_sqr(
6962
+ inline void ggml_cuda_op_leaky_relu(
6299
6963
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6300
6964
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6301
6965
 
6302
6966
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6303
6967
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6304
6968
 
6305
- sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6969
+ float negative_slope;
6970
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
6971
+
6972
+ leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
6306
6973
 
6307
6974
  (void) src1;
6308
6975
  (void) dst;
6309
6976
  (void) src1_dd;
6310
6977
  }
6311
6978
 
6312
- inline void ggml_cuda_op_norm(
6979
+ inline void ggml_cuda_op_sqr(
6313
6980
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6314
6981
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6315
6982
 
6316
6983
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6317
6984
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6318
6985
 
6319
- const int64_t ne00 = src0->ne[0];
6320
- const int64_t nrows = ggml_nrows(src0);
6321
-
6322
- norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
6986
+ sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
6323
6987
 
6324
6988
  (void) src1;
6325
6989
  (void) dst;
6326
6990
  (void) src1_dd;
6327
6991
  }
6328
6992
 
6329
- inline void ggml_cuda_op_rms_norm(
6993
+ inline void ggml_cuda_op_norm(
6330
6994
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6331
6995
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6332
6996
 
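Both the leaky-ReLU slope added above and the eps now passed to norm/rms_norm are stored as raw bits in the tensor's int32 op_params and recovered with memcpy, which avoids type-punning through pointer casts. A minimal sketch of that pattern plus the leaky-ReLU formula it feeds (helper names are illustrative):

    #include <cstdint>
    #include <cstring>

    static float read_f32_op_param(const int32_t * op_params, int idx) {
        float v;
        memcpy(&v, &op_params[idx], sizeof(float));   // reinterpret the stored bits as float
        return v;
    }

    static float leaky_relu_ref(float x, float negative_slope) {
        return x > 0.0f ? x : negative_slope * x;
    }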
@@ -6339,26 +7003,111 @@ inline void ggml_cuda_op_rms_norm(
6339
7003
  float eps;
6340
7004
  memcpy(&eps, dst->op_params, sizeof(float));
6341
7005
 
6342
- rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
7006
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
6343
7007
 
6344
7008
  (void) src1;
6345
7009
  (void) dst;
6346
7010
  (void) src1_dd;
6347
7011
  }
6348
7012
 
6349
- inline void ggml_cuda_op_mul_mat_q(
6350
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
6351
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6352
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
6353
7013
 
6354
- const int64_t ne00 = src0->ne[0];
7014
+ inline void ggml_cuda_op_group_norm(
7015
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7016
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6355
7017
 
6356
- const int64_t ne10 = src1->ne[0];
6357
- GGML_ASSERT(ne10 % QK8_1 == 0);
7018
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7019
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6358
7020
 
6359
- const int64_t ne0 = dst->ne[0];
7021
+ int num_groups = dst->op_params[0];
7022
+ int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
7023
+ group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
6360
7024
 
6361
- const int64_t row_diff = row_high - row_low;
7025
+ (void) src1;
7026
+ (void) dst;
7027
+ (void) src1_dd;
7028
+ }
7029
+
7030
+ inline void ggml_cuda_op_concat(
7031
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7032
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7033
+
7034
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7035
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
7036
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
7037
+
7038
+ for (int i3 = 0; i3 < dst->ne[3]; i3++) {
7039
+ concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
7040
+ }
7041
+
7042
+ (void) src1;
7043
+ (void) dst;
7044
+ }
7045
+
7046
+ inline void ggml_cuda_op_upscale(
7047
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7048
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7049
+
7050
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7051
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
7052
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
7053
+
7054
+ const int scale_factor = dst->op_params[0];
7055
+
7056
+ upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
7057
+
7058
+ (void) src1;
7059
+ (void) dst;
7060
+ }
7061
+
7062
+ inline void ggml_cuda_op_pad(
7063
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7064
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7065
+
7066
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7067
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
7068
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
7069
+
7070
+ pad_f32_cuda(src0_dd, dst_dd,
7071
+ src0->ne[0], src0->ne[1], src0->ne[2],
7072
+ dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
7073
+
7074
+ (void) src1;
7075
+ (void) dst;
7076
+ }
7077
+
7078
+ inline void ggml_cuda_op_rms_norm(
7079
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7080
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7081
+
7082
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7083
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7084
+
7085
+ const int64_t ne00 = src0->ne[0];
7086
+ const int64_t nrows = ggml_nrows(src0);
7087
+
7088
+ float eps;
7089
+ memcpy(&eps, dst->op_params, sizeof(float));
7090
+
7091
+ rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
7092
+
7093
+ (void) src1;
7094
+ (void) dst;
7095
+ (void) src1_dd;
7096
+ }
7097
+
7098
+ inline void ggml_cuda_op_mul_mat_q(
7099
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7100
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7101
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
7102
+
7103
+ const int64_t ne00 = src0->ne[0];
7104
+
7105
+ const int64_t ne10 = src1->ne[0];
7106
+ GGML_ASSERT(ne10 % QK8_1 == 0);
7107
+
7108
+ const int64_t ne0 = dst->ne[0];
7109
+
7110
+ const int64_t row_diff = row_high - row_low;
6362
7111
 
6363
7112
  int id;
6364
7113
  CUDA_CHECK(cudaGetDevice(&id));
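For the group_norm added in the hunk above, the per-group element count is derived by splitting the channel dimension (ne[2]) into num_groups groups, rounding up, with each group covering whole ne[0]*ne[1] planes. The arithmetic, spelled out as a standalone sketch:

    #include <cstdint>

    static int64_t group_norm_group_size(int64_t ne0, int64_t ne1, int64_t ne2, int num_groups) {
        const int64_t channels_per_group = (ne2 + num_groups - 1) / num_groups;   // ceil(ne2 / num_groups)
        return ne0 * ne1 * channels_per_group;
    }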
@@ -6474,6 +7223,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
6474
7223
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6475
7224
  const int64_t src1_padded_row_size, const cudaStream_t & stream) {
6476
7225
 
7226
+ GGML_ASSERT(ggml_nrows(src1) == 1);
7227
+
6477
7228
  const int64_t ne00 = src0->ne[0];
6478
7229
  const int64_t row_diff = row_high - row_low;
6479
7230
 
@@ -6533,7 +7284,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
6533
7284
  size_t ash;
6534
7285
  dfloat * src1_dfloat = nullptr; // dfloat == half
6535
7286
 
6536
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
7287
+ bool src1_convert_f16 =
7288
+ src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
6537
7289
  src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
6538
7290
  src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
6539
7291
 
@@ -6837,7 +7589,6 @@ inline void ggml_cuda_op_im2col(
6837
7589
 
6838
7590
  const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
6839
7591
 
6840
- const int64_t N = src1->ne[is_2D ? 3 : 2];
6841
7592
  const int64_t IC = src1->ne[is_2D ? 2 : 1];
6842
7593
  const int64_t IH = is_2D ? src1->ne[1] : 1;
6843
7594
  const int64_t IW = src1->ne[0];
@@ -6848,17 +7599,51 @@ inline void ggml_cuda_op_im2col(
6848
7599
  const int64_t OH = is_2D ? dst->ne[2] : 1;
6849
7600
  const int64_t OW = dst->ne[1];
6850
7601
 
6851
- const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
6852
- const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
7602
+ const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
6853
7603
 
6854
- im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
6855
- OH, IW, IH, OW, IC, KH, KW, N,
6856
- ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
7604
+ im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
6857
7605
 
6858
7606
  (void) src0;
6859
7607
  (void) src0_dd;
6860
7608
  }
6861
7609
 
7610
+
7611
+ inline void ggml_cuda_op_sum_rows(
7612
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7613
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7614
+
7615
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7616
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7617
+
7618
+ const int64_t ncols = src0->ne[0];
7619
+ const int64_t nrows = ggml_nrows(src0);
7620
+
7621
+ sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
7622
+
7623
+ (void) src1;
7624
+ (void) dst;
7625
+ (void) src1_dd;
7626
+ }
7627
+
7628
+ inline void ggml_cuda_op_argsort(
7629
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7630
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7631
+
7632
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7633
+ GGML_ASSERT( dst->type == GGML_TYPE_I32);
7634
+
7635
+ const int64_t ncols = src0->ne[0];
7636
+ const int64_t nrows = ggml_nrows(src0);
7637
+
7638
+ enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
7639
+
7640
+ argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
7641
+
7642
+ (void) src1;
7643
+ (void) dst;
7644
+ (void) src1_dd;
7645
+ }
7646
+
6862
7647
  inline void ggml_cuda_op_diag_mask_inf(
6863
7648
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6864
7649
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
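The sum_rows and argsort ops added above are thin wrappers that forward the column/row counts and, for argsort, an ascending/descending order flag taken from op_params. A CPU reference of the per-row argsort result the new op has to match; the concrete enum values are an assumption for illustration:

    #include <algorithm>
    #include <cstdint>

    enum sort_order_ref { SORT_ORDER_ASC_REF = 0, SORT_ORDER_DESC_REF = 1 };

    // For one row of ncols floats, write the column indices ordered by value into idx.
    static void argsort_row_ref(const float * row, int32_t * idx, int64_t ncols, sort_order_ref order) {
        for (int64_t i = 0; i < ncols; i++) idx[i] = (int32_t) i;
        std::sort(idx, idx + ncols, [&](int32_t a, int32_t b) {
            return order == SORT_ORDER_ASC_REF ? row[a] < row[b] : row[a] > row[b];
        });
    }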
@@ -7067,7 +7852,7 @@ static void ggml_cuda_op_mul_mat(
7067
7852
  const int64_t ne01 = src0->ne[1];
7068
7853
  const int64_t ne02 = src0->ne[2];
7069
7854
  const int64_t ne03 = src0->ne[3];
7070
- // const int64_t nrows0 = ggml_nrows(src0);
7855
+ const int64_t nrows0 = ggml_nrows(src0);
7071
7856
 
7072
7857
  const int64_t ne10 = src1->ne[0];
7073
7858
  const int64_t ne11 = src1->ne[1];
@@ -7103,10 +7888,9 @@ static void ggml_cuda_op_mul_mat(
7103
7888
 
7104
7889
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
7105
7890
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
7106
-
7107
7891
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
7108
- const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
7109
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
7892
+
7893
+ const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
7110
7894
 
7111
7895
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
7112
7896
  GGML_ASSERT(!(split && ne02 > 1));
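The padded column size above is rewritten in terms of GGML_PAD, which rounds ne10 up to the next multiple of MATRIX_ROW_PADDING; for a power-of-two padding such as 512 this is equivalent to the old modulo expression. A quick check of that equivalence (the divide-based round-up below is only an illustration; the actual macro may use a bitmask form):

    #include <cassert>
    #include <cstdint>

    static int64_t pad_up(int64_t x, int64_t n) {
        return ((x + n - 1) / n) * n;   // smallest multiple of n that is >= x
    }

    static void check_pad_equivalence(int64_t ne10, int64_t n) {
        const int64_t old_form = ne10 % n == 0 ? ne10 : ne10 - ne10 % n + n;
        assert(old_form == pad_up(ne10, n));
    }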
@@ -7231,7 +8015,7 @@ static void ggml_cuda_op_mul_mat(
7231
8015
  const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
7232
8016
 
7233
8017
  // for split tensors the data begins at i0 == i0_offset_low
7234
- char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
8018
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
7235
8019
  float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
7236
8020
  char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
7237
8021
  float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@@ -7372,10 +8156,18 @@ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, gg
7372
8156
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
7373
8157
  }
7374
8158
 
8159
+ static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8160
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
8161
+ }
8162
+
7375
8163
  static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7376
8164
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
7377
8165
  }
7378
8166
 
8167
+ static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8168
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
8169
+ }
8170
+
7379
8171
  static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7380
8172
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
7381
8173
  }
@@ -7384,10 +8176,22 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
7384
8176
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
7385
8177
  }
7386
8178
 
8179
+ static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8180
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
8181
+ }
8182
+
8183
+ static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8184
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
8185
+ }
8186
+
7387
8187
  static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7388
8188
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
7389
8189
  }
7390
8190
 
8191
+ static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8192
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
8193
+ }
8194
+
7391
8195
  static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7392
8196
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
7393
8197
  }
@@ -7396,12 +8200,28 @@ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, g
7396
8200
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
7397
8201
  }
7398
8202
 
8203
+ static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8204
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
8205
+ }
8206
+
8207
+ static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8208
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
8209
+ }
8210
+
8211
+ static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8212
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
8213
+ }
8214
+
8215
+ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8216
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
8217
+ }
8218
+
7399
8219
  static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7400
8220
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
7401
8221
  }
7402
8222
 
7403
8223
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
7404
- if (!g_cublas_loaded) { return false; }
8224
+ if (!g_cublas_loaded) return false;
7405
8225
 
7406
8226
  const int64_t ne10 = src1->ne[0];
7407
8227
 
@@ -7479,7 +8299,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7479
8299
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7480
8300
  }
7481
8301
 
7482
- __global__ static void k_compute_batched_ptrs(
8302
+ static __global__ void k_compute_batched_ptrs(
7483
8303
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7484
8304
  const void ** ptrs_src, void ** ptrs_dst,
7485
8305
  int ne12, int ne13,
@@ -7535,9 +8355,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7535
8355
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7536
8356
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7537
8357
 
7538
- int id;
7539
- CUDA_CHECK(cudaGetDevice(&id));
7540
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
8358
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
7541
8359
 
7542
8360
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7543
8361
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -7594,7 +8412,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7594
8412
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
7595
8413
  // use cublasGemmStridedBatchedEx
7596
8414
  CUBLAS_CHECK(
7597
- cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
8415
+ cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7598
8416
  ne01, ne11, ne10,
7599
8417
  &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
7600
8418
  (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
@@ -7628,7 +8446,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7628
8446
  CUDA_CHECK(cudaGetLastError());
7629
8447
 
7630
8448
  CUBLAS_CHECK(
7631
- cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
8449
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7632
8450
  ne01, ne11, ne10,
7633
8451
  &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7634
8452
  (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
@@ -7698,10 +8516,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7698
8516
  #ifdef GGML_CUDA_FORCE_DMMV
7699
8517
  const bool use_mul_mat_vec_q = false;
7700
8518
  #else
7701
- const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
8519
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
7702
8520
  #endif // GGML_CUDA_FORCE_DMMV
7703
8521
 
7704
8522
  if (use_mul_mat_vec_q) {
8523
+ // NOTE: this kernel does not support ggml_nrows(src1) > 1
7705
8524
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
7706
8525
  } else {
7707
8526
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
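The mul_mat_vec_q path is now taken only when src1 has a single row, matching the GGML_ASSERT added to ggml_cuda_op_mul_mat_vec_q further up; larger batches fall back to the dequantize_mul_mat_vec or matrix-matrix kernels. The selection predicate, written out as a sketch (the 610 compute-capability threshold for dp4a is an assumption, not taken from this diff):

    #include <cstdint>

    static bool use_mul_mat_vec_q_ref(int min_compute_capability, bool src0_is_quantized, int64_t src1_nrows) {
        const int min_cc_dp4a = 610;   // assumed MIN_CC_DP4A: dp4a needs compute capability >= 6.1
        return min_compute_capability >= min_cc_dp4a && src0_is_quantized && src1_nrows == 1;
    }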
@@ -7726,6 +8545,252 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7726
8545
  }
7727
8546
  }
7728
8547
 
8548
+ #if 0
8549
+ template<typename ... Srcs>
8550
+ static __global__ void k_compute_batched_ptrs_id(
8551
+ const void ** ptrs_src, void ** ptrs_dst,
8552
+ int ne12, int ne13,
8553
+ int ne23,
8554
+ int nb02, int nb03,
8555
+ int nb12, int nb13,
8556
+ int nb2, int nb3,
8557
+ int r2, int r3,
8558
+ ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
8559
+ const half * src1_f16, half * dst_f16,
8560
+ const int32_t * ids, const int id,
8561
+ Srcs... src0s) {
8562
+
8563
+ int i = ids[id];
8564
+
8565
+ half * src0_f16;
8566
+ const void * srcs_ar[] = { (const half *) src0s... };
8567
+ if (src0_type == GGML_TYPE_F16) {
8568
+ src0_f16 = (half *) srcs_ar[i];
8569
+ } else {
8570
+ src0_f16 = src0_as_f16;
8571
+ if (threadIdx.x == 0 && threadIdx.y == 0) {
8572
+ const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
8573
+ to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
8574
+ }
8575
+ }
8576
+
8577
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
8578
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
8579
+
8580
+ if (i13 >= ne13 || i12 >= ne12) {
8581
+ return;
8582
+ }
8583
+
8584
+ int i03 = i13 / r3;
8585
+ int i02 = i12 / r2;
8586
+
8587
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
8588
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
8589
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
8590
+ }
8591
+
8592
+ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
8593
+ const struct ggml_tensor * ids = dst->src[0];
8594
+ const struct ggml_tensor * src1 = dst->src[1];
8595
+ const struct ggml_tensor * src00 = dst->src[2];
8596
+
8597
+ const int id = dst->op_params[0];
8598
+
8599
+ GGML_ASSERT(!ggml_is_transposed(src00));
8600
+ GGML_ASSERT(!ggml_is_transposed(src1));
8601
+
8602
+ GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
8603
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
8604
+
8605
+ const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
8606
+ const int64_t ne01 = src00->ne[1];
8607
+ const int64_t ne02 = src00->ne[2];
8608
+ const int64_t ne03 = src00->ne[3];
8609
+
8610
+ //const int64_t nb01 = src00->nb[1];
8611
+ const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
8612
+ const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
8613
+
8614
+ const int64_t ne10 = src1->ne[0];
8615
+ const int64_t ne11 = src1->ne[1];
8616
+ const int64_t ne12 = src1->ne[2];
8617
+ const int64_t ne13 = src1->ne[3];
8618
+
8619
+ //const int64_t nb11 = src1->nb[1];
8620
+ const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
8621
+ const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
8622
+
8623
+ const int64_t ne1 = ggml_nelements(src1);
8624
+ const int64_t ne = ggml_nelements(dst);
8625
+
8626
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8627
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8628
+
8629
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
8630
+
8631
+ //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
8632
+ //void * src0_ddq = src0_extra->data_device[g_main_device];
8633
+ //half * src0_as_f16 = (half *) src0_ddq;
8634
+
8635
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
8636
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
8637
+
8638
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
8639
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
8640
+
8641
+ // convert src1 to fp16
8642
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
8643
+ GGML_ASSERT(to_fp16_cuda != nullptr);
8644
+
8645
+ size_t src1_as = 0;
8646
+ half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
8647
+ to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
8648
+
8649
+ size_t dst_as = 0;
8650
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8651
+
8652
+ GGML_ASSERT(ne12 % ne02 == 0);
8653
+ GGML_ASSERT(ne13 % ne03 == 0);
8654
+
8655
+ // broadcast factors
8656
+ const int64_t r2 = ne12/ne02;
8657
+ const int64_t r3 = ne13/ne03;
8658
+
8659
+ const half alpha_f16 = 1.0f;
8660
+ const half beta_f16 = 0.0f;
8661
+
8662
+ // use cublasGemmBatchedEx
8663
+ const int ne23 = ne12*ne13;
8664
+
8665
+ const void ** ptrs_src = nullptr;
8666
+ void ** ptrs_dst = nullptr;
8667
+
8668
+ size_t ptrs_src_s = 0;
8669
+ size_t ptrs_dst_s = 0;
8670
+
8671
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
8672
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
8673
+
8674
+ int64_t src0_ne = ggml_nelements(src00);
8675
+ half * src0_as_f16 = nullptr;
8676
+ size_t src0_as = 0;
8677
+ if (src00->type != GGML_TYPE_F16) {
8678
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
8679
+ }
8680
+
8681
+ static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
8682
+ dim3 block_dims(ne13, ne12);
8683
+ k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
8684
+ ptrs_src, ptrs_dst,
8685
+ ne12, ne13,
8686
+ ne23,
8687
+ ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
8688
+ nb12, nb13,
8689
+ dst->nb[2], dst->nb[3],
8690
+ r2, r3,
8691
+ src00->type, src0_as_f16, src0_ne,
8692
+ src1_as_f16, dst_f16,
8693
+ (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
8694
+ dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
8695
+ dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
8696
+ dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
8697
+ dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
8698
+ );
8699
+ CUDA_CHECK(cudaGetLastError());
8700
+
8701
+ CUBLAS_CHECK(
8702
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8703
+ ne01, ne11, ne10,
8704
+ &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
8705
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
8706
+ &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
8707
+ ne23,
8708
+ CUBLAS_COMPUTE_16F,
8709
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8710
+
8711
+ if (src0_as != 0) {
8712
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
8713
+ }
8714
+ if (ptrs_src_s != 0) {
8715
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
8716
+ }
8717
+ if (ptrs_dst_s != 0) {
8718
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
8719
+ }
8720
+
8721
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8722
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8723
+
8724
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
8725
+ ggml_cuda_pool_free(dst_f16, dst_as);
8726
+ }
8727
+ #endif
8728
+
8729
+ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8730
+ #if 0
8731
+ ggml_cuda_mul_mat_id_cublas(dst);
8732
+ // TODO: mmq/mmv support
8733
+ #endif
8734
+
8735
+ GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
8736
+
8737
+ const struct ggml_tensor * ids = src0;
8738
+ const int32_t id = ((int32_t *) dst->op_params)[0];
8739
+ const int32_t n_as = ((int32_t *) dst->op_params)[1];
8740
+
8741
+ std::vector<char> ids_host(ggml_nbytes(ids));
8742
+
8743
+ if (ids->backend == GGML_BACKEND_GPU) {
8744
+ const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8745
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8746
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8747
+ } else {
8748
+ memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
8749
+ }
8750
+
8751
+ const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
8752
+ const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
8753
+
8754
+ ggml_tensor_extra_gpu src1_row_extra;
8755
+ ggml_tensor_extra_gpu dst_row_extra;
8756
+
8757
+ ggml_tensor src1_row = *src1;
8758
+ ggml_tensor dst_row = *dst;
8759
+
8760
+ src1_row.ne[1] = 1;
8761
+ dst_row.ne[1] = 1;
8762
+
8763
+ src1_row.nb[2] = src1_row.nb[1];
8764
+ dst_row.nb[2] = dst_row.nb[1];
8765
+
8766
+ src1_row.nb[3] = src1_row.nb[1];
8767
+ dst_row.nb[3] = dst_row.nb[1];
8768
+
8769
+ src1_row.extra = &src1_row_extra;
8770
+ dst_row.extra = &dst_row_extra;
8771
+
8772
+
8773
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8774
+ //int32_t row_id;
8775
+ //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8776
+ //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8777
+
8778
+ const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8779
+
8780
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
8781
+
8782
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
8783
+
8784
+ src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
8785
+ src1_row.data = (char *) src1->data + i01*src1->nb[1];
8786
+
8787
+ dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
8788
+ dst_row.data = (char *) dst->data + i01*dst->nb[1];
8789
+
8790
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8791
+ }
8792
+ }
8793
+
7729
8794
  static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7730
8795
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
7731
8796
  }
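The new ggml_cuda_mul_mat_id above implements the mixture-of-experts matmul: the ids tensor is copied to the host, and for each row of src1 the selected expert matrix (dst->src[row_id + 2]) is multiplied against a one-row view of src1 into a one-row view of dst. A CPU analogue of that per-row dispatch, with a plain mat-vec standing in for ggml_cuda_mul_mat; all helper names are illustrative:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // y = W * x, with W stored row-major as rows x cols
    static void matvec_ref(const std::vector<float> & W, const float * x, float * y,
                           int64_t rows, int64_t cols) {
        for (int64_t r = 0; r < rows; r++) {
            float acc = 0.0f;
            for (int64_t c = 0; c < cols; c++) acc += W[r*cols + c]*x[c];
            y[r] = acc;
        }
    }

    // experts: one rows x cols weight matrix per expert; ids: one expert index per input row
    static void mul_mat_id_ref(const std::vector<std::vector<float>> & experts,
                               const std::vector<int32_t> & ids,
                               const float * src1, float * dst, int64_t rows, int64_t cols) {
        for (size_t i = 0; i < ids.size(); i++) {
            matvec_ref(experts[ids[i]], src1 + i*cols, dst + i*rows, rows, cols);
        }
    }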
@@ -7770,14 +8835,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7770
8835
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
7771
8836
 
7772
8837
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
7773
- ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7774
- ne10, ne11, nb10, nb11, nb12, main_stream);
8838
+ ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7775
8839
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
7776
- ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7777
- ne10, ne11, nb10, nb11, nb12, main_stream);
8840
+ ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8841
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
8842
+ ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8843
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
8844
+ ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8845
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
8846
+ ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7778
8847
  } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
7779
- ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7780
- ne10, ne11, nb10, nb11, nb12, main_stream);
8848
+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7781
8849
  } else {
7782
8850
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
7783
8851
  ggml_type_name(src0->type), ggml_type_name(src1->type));
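ggml_cuda_cpy above gains f32 -> Q8_0 / Q4_0 / Q4_1 destinations, i.e. the copy quantizes on the fly. For orientation, a CPU sketch of quantizing one Q8_0 block of 32 values; the real block stores the scale as fp16, here it is kept as a float for simplicity:

    #include <cmath>
    #include <cstdint>

    struct block_q8_0_ref { float d; int8_t qs[32]; };   // scale + 32 quantized values

    static void quantize_block_q8_0_ref(const float * x, block_q8_0_ref * y) {
        float amax = 0.0f;
        for (int i = 0; i < 32; i++) amax = fmaxf(amax, fabsf(x[i]));
        const float d  = amax / 127.0f;                  // largest magnitude maps to +/-127
        const float id = d != 0.0f ? 1.0f/d : 0.0f;
        y->d = d;
        for (int i = 0; i < 32; i++) y->qs[i] = (int8_t) roundf(x[i]*id);
    }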
@@ -7788,6 +8856,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7788
8856
  }
7789
8857
 
7790
8858
  static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8859
+ // TODO: why do we pass dst as src1 here?
7791
8860
  ggml_cuda_cpy(src0, dst, nullptr);
7792
8861
  (void) src1;
7793
8862
  }
@@ -7813,12 +8882,28 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
7813
8882
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
7814
8883
  }
7815
8884
 
8885
+ static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8886
+ GGML_ASSERT(ggml_is_contiguous(src0));
8887
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
8888
+ }
8889
+
8890
+ static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8891
+ GGML_ASSERT(ggml_is_contiguous(src0));
8892
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
8893
+ }
8894
+
7816
8895
  static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7817
8896
  (void) src0;
7818
8897
  (void) src1;
7819
8898
  (void) dst;
7820
8899
  }
7821
8900
 
8901
+ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
8902
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
8903
+
8904
+ return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
8905
+ }
8906
+
7822
8907
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
7823
8908
  const int64_t nrows = ggml_nrows(tensor);
7824
8909
 
@@ -7868,8 +8953,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
7868
8953
 
7869
8954
  // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
7870
8955
  if (ne0 % MATRIX_ROW_PADDING != 0) {
7871
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
7872
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8956
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
7873
8957
  }
7874
8958
 
7875
8959
  char * buf;
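Both the new ggml_nbytes_split helper and the padding rewrite above size data in terms of ggml_row_size instead of spelling out type_size/block_size arithmetic. What that row size amounts to, as a sketch (assuming ne0 is a multiple of the block size, which the row padding guarantees here):

    #include <cstddef>
    #include <cstdint>

    // ne0 elements stored in blocks of blck_size elements, each block taking type_size bytes
    static size_t row_size_ref(int64_t ne0, int64_t blck_size, size_t type_size) {
        return (size_t)(ne0/blck_size)*type_size;
    }

    // e.g. Q4_0: blck_size = 32, type_size = 18 (2-byte fp16 scale + 16 bytes of 4-bit values)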
@@ -8068,8 +9152,9 @@ void ggml_cuda_set_main_device(const int main_device) {
8068
9152
  main_device, g_device_count, g_main_device);
8069
9153
  return;
8070
9154
  }
8071
- g_main_device = main_device;
8072
- if (g_device_count > 1) {
9155
+
9156
+ if (g_main_device != main_device && g_device_count > 1) {
9157
+ g_main_device = main_device;
8073
9158
  cudaDeviceProp prop;
8074
9159
  CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
8075
9160
  fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
@@ -8095,7 +9180,7 @@ void ggml_cuda_free_scratch() {
8095
9180
  }
8096
9181
 
8097
9182
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
8098
- if (!g_cublas_loaded) { return false; }
9183
+ if (!g_cublas_loaded) return false;
8099
9184
 
8100
9185
  ggml_cuda_func_t func;
8101
9186
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8128,9 +9213,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8128
9213
  case GGML_OP_ADD:
8129
9214
  func = ggml_cuda_add;
8130
9215
  break;
9216
+ case GGML_OP_ACC:
9217
+ func = ggml_cuda_acc;
9218
+ break;
8131
9219
  case GGML_OP_MUL:
8132
9220
  func = ggml_cuda_mul;
8133
9221
  break;
9222
+ case GGML_OP_DIV:
9223
+ func = ggml_cuda_div;
9224
+ break;
8134
9225
  case GGML_OP_UNARY:
8135
9226
  switch (ggml_get_unary_op(tensor)) {
8136
9227
  case GGML_UNARY_OP_GELU:
@@ -8139,15 +9230,37 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8139
9230
  case GGML_UNARY_OP_SILU:
8140
9231
  func = ggml_cuda_silu;
8141
9232
  break;
9233
+ case GGML_UNARY_OP_GELU_QUICK:
9234
+ func = ggml_cuda_gelu_quick;
9235
+ break;
9236
+ case GGML_UNARY_OP_TANH:
9237
+ func = ggml_cuda_tanh;
9238
+ break;
8142
9239
  case GGML_UNARY_OP_RELU:
8143
9240
  func = ggml_cuda_relu;
8144
9241
  break;
8145
9242
  default:
8146
9243
  return false;
8147
- } break;
9244
+ }
9245
+ break;
8148
9246
  case GGML_OP_NORM:
8149
9247
  func = ggml_cuda_norm;
8150
9248
  break;
9249
+ case GGML_OP_GROUP_NORM:
9250
+ func = ggml_cuda_group_norm;
9251
+ break;
9252
+ case GGML_OP_CONCAT:
9253
+ func = ggml_cuda_concat;
9254
+ break;
9255
+ case GGML_OP_UPSCALE:
9256
+ func = ggml_cuda_upscale;
9257
+ break;
9258
+ case GGML_OP_PAD:
9259
+ func = ggml_cuda_pad;
9260
+ break;
9261
+ case GGML_OP_LEAKY_RELU:
9262
+ func = ggml_cuda_leaky_relu;
9263
+ break;
8151
9264
  case GGML_OP_RMS_NORM:
8152
9265
  func = ggml_cuda_rms_norm;
8153
9266
  break;
@@ -8157,6 +9270,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8157
9270
  }
8158
9271
  func = ggml_cuda_mul_mat;
8159
9272
  break;
9273
+ case GGML_OP_MUL_MAT_ID:
9274
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
9275
+ return false;
9276
+ }
9277
+ func = ggml_cuda_mul_mat_id;
9278
+ break;
8160
9279
  case GGML_OP_SCALE:
8161
9280
  func = ggml_cuda_scale;
8162
9281
  break;
@@ -8164,9 +9283,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8164
9283
  func = ggml_cuda_sqr;
8165
9284
  break;
8166
9285
  case GGML_OP_CLAMP:
8167
- if (!any_on_device) {
8168
- return false;
8169
- }
8170
9286
  func = ggml_cuda_clamp;
8171
9287
  break;
8172
9288
  case GGML_OP_CPY:
@@ -8175,6 +9291,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8175
9291
  case GGML_OP_CONT:
8176
9292
  func = ggml_cuda_dup;
8177
9293
  break;
9294
+ case GGML_OP_NONE:
8178
9295
  case GGML_OP_RESHAPE:
8179
9296
  case GGML_OP_VIEW:
8180
9297
  case GGML_OP_PERMUTE:
@@ -8196,6 +9313,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8196
9313
  case GGML_OP_IM2COL:
8197
9314
  func = ggml_cuda_im2col;
8198
9315
  break;
9316
+ case GGML_OP_SUM_ROWS:
9317
+ func = ggml_cuda_sum_rows;
9318
+ break;
9319
+ case GGML_OP_ARGSORT:
9320
+ func = ggml_cuda_argsort;
9321
+ break;
8199
9322
  default:
8200
9323
  return false;
8201
9324
  }
@@ -8212,7 +9335,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8212
9335
 
8213
9336
  int ggml_cuda_get_device_count() {
8214
9337
  int device_count;
8215
- CUDA_CHECK(cudaGetDeviceCount(&device_count));
9338
+ if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
9339
+ return 0;
9340
+ }
8216
9341
  return device_count;
8217
9342
  }
8218
9343
 
@@ -8228,27 +9353,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
8228
9353
 
8229
9354
  #define UNUSED GGML_UNUSED
8230
9355
 
8231
- struct ggml_backend_context_cuda {
8232
- };
8233
-
8234
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
8235
- return GGML_CUDA_NAME;
8236
-
8237
- UNUSED(backend);
8238
- }
8239
-
8240
- static void ggml_backend_cuda_free(ggml_backend_t backend) {
8241
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
8242
- delete cuda_ctx;
8243
- delete backend;
8244
- }
9356
+ // cuda buffer
8245
9357
 
8246
9358
  struct ggml_backend_buffer_context_cuda {
8247
- void * device;
8248
-
9359
+ int device;
9360
+ void * dev_ptr = nullptr;
8249
9361
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
8250
9362
  size_t temp_tensor_extra_index = 0;
8251
9363
 
9364
+ ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
9365
+
8252
9366
  ~ggml_backend_buffer_context_cuda() {
8253
9367
  delete[] temp_tensor_extras;
8254
9368
  }
@@ -8269,41 +9383,20 @@ struct ggml_backend_buffer_context_cuda {
8269
9383
 
8270
9384
  static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
8271
9385
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8272
- CUDA_CHECK(cudaFree(ctx->device));
9386
+ CUDA_CHECK(cudaFree(ctx->dev_ptr));
8273
9387
  delete ctx;
8274
9388
  }
8275
9389
 
8276
9390
  static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
8277
9391
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8278
- return ctx->device;
8279
- }
8280
-
8281
- static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8282
- int64_t row_low = 0;
8283
- int64_t row_high = ggml_nrows(tensor);
8284
- int64_t nrows_split = row_high - row_low;
8285
-
8286
- size_t size = ggml_nbytes_split(tensor, nrows_split);
8287
-
8288
- int64_t ne0 = tensor->ne[0];
8289
-
8290
- if (ggml_is_quantized(tensor->type)) {
8291
- if (ne0 % MATRIX_ROW_PADDING != 0) {
8292
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8293
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8294
- }
8295
- }
8296
-
8297
- return size;
8298
-
8299
- UNUSED(buffer);
9392
+ return ctx->dev_ptr;
8300
9393
  }
8301
9394
 
8302
9395
  static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8303
9396
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8304
9397
 
8305
9398
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
8306
- assert(tensor->view_src->buffer->backend == buffer->backend);
9399
+ assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
8307
9400
  tensor->backend = tensor->view_src->backend;
8308
9401
  tensor->extra = tensor->view_src->extra;
8309
9402
  return;
@@ -8311,7 +9404,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8311
9404
 
8312
9405
  ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
8313
9406
 
8314
- extra->data_device[g_main_device] = tensor->data;
9407
+ extra->data_device[ctx->device] = tensor->data;
8315
9408
 
8316
9409
  tensor->backend = GGML_BACKEND_GPU;
8317
9410
  tensor->extra = extra;
@@ -8323,64 +9416,207 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8323
9416
  int64_t nrows_split = row_high - row_low;
8324
9417
 
8325
9418
  size_t original_size = ggml_nbytes_split(tensor, nrows_split);
8326
- size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
9419
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
8327
9420
 
8328
9421
  if (padded_size > original_size && tensor->view_src == nullptr) {
8329
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
9422
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
8330
9423
  }
8331
9424
  }
8332
9425
 
8333
9426
  UNUSED(buffer);
8334
9427
  }
8335
9428
 
9429
+ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
9430
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
9431
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9432
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9433
+
9434
+ CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
9435
+
9436
+ UNUSED(buffer);
9437
+ }
9438
+
9439
+ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
9440
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
9441
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
9442
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9443
+
9444
+ CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
9445
+
9446
+ UNUSED(buffer);
9447
+ }
9448
+
8336
9449
  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
8337
- /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
- /* .get_base = */ ggml_backend_cuda_buffer_get_base,
- /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
- /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
- /* .free_tensor = */ NULL,
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
+ /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
+ /* .cpy_tensor_from = */ NULL,
+ /* .cpy_tensor_to = */ NULL,
  };

- static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
- ggml_cuda_set_device(g_main_device);
+ // cuda buffer type

- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ int device = (int) (intptr_t) buft->context;
+
+ ggml_cuda_set_device(device);

  size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

- ggml_cuda_set_device(g_main_device);
- CUDA_CHECK(cudaMalloc(&ctx->device, size));
+ void * dev_ptr;
+ CUDA_CHECK(cudaMalloc(&dev_ptr, size));

- return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+ ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+
+ return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
  }

- static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
  return 128;
+
+ UNUSED(buft);
+ }
+
+ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+ int64_t row_low = 0;
+ int64_t row_high = ggml_nrows(tensor);
+ int64_t nrows_split = row_high - row_low;
+
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+ int64_t ne0 = tensor->ne[0];
+
+ if (ggml_is_quantized(tensor->type)) {
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ }
+ }
+
+ return size;
+
+ UNUSED(buft);
+ }
+
+ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+ return ggml_backend_is_cuda(backend);
+
+ UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+ /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+ /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+ };
+
+ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+ static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
+ static bool ggml_backend_buffer_type_cuda_initialized = false;
+ if (!ggml_backend_buffer_type_cuda_initialized) {
+ for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+ ggml_backend_buffer_type_cuda[i] = {
+ /* .iface = */ cuda_backend_buffer_type_interface,
+ /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+ };
+ }
+ ggml_backend_buffer_type_cuda_initialized = true;
+ }
+
+ return &ggml_backend_buffer_type_cuda[device];
+ }
+
+ // host buffer type
+
+ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
+ delete ctx;
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ void * ptr;
+ CUDA_CHECK(cudaMallocHost(&ptr, size));
+
+ // FIXME: this is a hack to avoid having to implement a new buffer type
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+ buffer->buft = buft;
+ buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+ return buffer;
+
+ UNUSED(buft);
+ }
+
+ struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+ };
+
+ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+ static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
+ /* .iface = */ cuda_backend_host_buffer_type_interface,
+ /* .context = */ nullptr,
+ };
+
+ return &ggml_backend_buffer_type_cuda_host;
+ }
+
+ // backend
+
+ struct ggml_backend_context_cuda {
+ int device;
+ };
+
+ static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+ return GGML_CUDA_NAME;
+
  UNUSED(backend);
  }

+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+ delete cuda_ctx;
+ delete backend;
+ }
+
+ static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+ return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+ }
+
  static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+ GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
-
- UNUSED(backend);
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
  }

  static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+ GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-
- UNUSED(backend);
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
  }

  static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));

  UNUSED(backend);
  }
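The hunk above moves buffer allocation from the backend object to a per-device buffer type. As a rough illustration only (not part of the diff), allocating a device buffer through this API might look like the sketch below; it assumes that ggml-cuda.h of this release declares ggml_backend_cuda_buffer_type, and that ggml_backend_buft_alloc_buffer and ggml_backend_buffer_free are the generic helpers declared in ggml-backend.h.

    // sketch only: allocate and release a buffer on CUDA device 0
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main() {
        // one ggml_backend_buffer_type_t exists per device (see the static table above)
        ggml_backend_buffer_type_t buft = ggml_backend_cuda_buffer_type(0);

        // forwards to ggml_backend_cuda_buffer_type_alloc_buffer, which calls
        // cudaMalloc on the device encoded in buft->context
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16 * 1024 * 1024);

        // tensors would normally be placed into the buffer via ggml-alloc here

        ggml_backend_buffer_free(buf);
        return 0;
    }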
@@ -8394,14 +9630,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
  UNUSED(cgraph);
  }

- [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
  UNUSED(plan);
  }

- [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  GGML_ASSERT(!"not implemented");

  UNUSED(backend);
@@ -8409,7 +9645,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
  }

  static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
- ggml_cuda_set_device(g_main_device);
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+ ggml_cuda_set_main_device(cuda_ctx->device);

  ggml_compute_params params = {};
  params.type = GGML_TASK_COMPUTE;
@@ -8417,13 +9655,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];

- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
  continue;
- }
+
  assert(node->backend == GGML_BACKEND_GPU);
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+ assert(node->extra != nullptr);
+
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  if (node->src[j] != nullptr) {
  assert(node->src[j]->backend == GGML_BACKEND_GPU);
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+ assert(node->src[j]->extra != nullptr);
  }
  }
@@ -8460,27 +9703,143 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
  UNUSED(backend);
  }

+ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+ switch (op->op) {
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(op)) {
+ case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_SILU:
+ case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_GELU_QUICK:
+ case GGML_UNARY_OP_TANH:
+ return true;
+ default:
+ return false;
+ }
+ break;
+ case GGML_OP_MUL_MAT:
+ case GGML_OP_MUL_MAT_ID:
+ {
+ struct ggml_tensor * a;
+ struct ggml_tensor * b;
+ if (op->op == GGML_OP_MUL_MAT) {
+ a = op->src[0];
+ b = op->src[1];
+ } else {
+ a = op->src[2];
+ b = op->src[1];
+ }
+ if (a->ne[3] != b->ne[3]) {
+ return false;
+ }
+ return true;
+ } break;
+ case GGML_OP_GET_ROWS:
+ {
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ return true;
+ default:
+ return false;
+ }
+ } break;
+ case GGML_OP_CPY:
+ {
+ ggml_type src0_type = op->src[0]->type;
+ ggml_type src1_type = op->src[1]->type;
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+ return true;
+ }
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+ return true;
+ }
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+ return true;
+ }
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+ return true;
+ }
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+ return true;
+ }
+ if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+ return true;
+ }
+ return false;
+ } break;
+ case GGML_OP_NONE:
+ case GGML_OP_RESHAPE:
+ case GGML_OP_VIEW:
+ case GGML_OP_PERMUTE:
+ case GGML_OP_TRANSPOSE:
+ case GGML_OP_NORM:
+ case GGML_OP_REPEAT:
+ case GGML_OP_DUP:
+ case GGML_OP_ADD:
+ case GGML_OP_MUL:
+ case GGML_OP_DIV:
+ case GGML_OP_RMS_NORM:
+ case GGML_OP_SCALE:
+ case GGML_OP_SQR:
+ case GGML_OP_CLAMP:
+ case GGML_OP_CONT:
+ case GGML_OP_DIAG_MASK_INF:
+ case GGML_OP_SOFT_MAX:
+ case GGML_OP_ROPE:
+ case GGML_OP_ALIBI:
+ case GGML_OP_IM2COL:
+ case GGML_OP_SUM_ROWS:
+ case GGML_OP_ARGSORT:
+ case GGML_OP_ACC:
+ case GGML_OP_CONCAT:
+ case GGML_OP_GROUP_NORM:
+ case GGML_OP_UPSCALE:
+ case GGML_OP_PAD:
+ case GGML_OP_LEAKY_RELU:
+ return true;
+ default:
+ return false;
+ }
+
+ UNUSED(backend);
+ }
+
  static ggml_backend_i cuda_backend_i = {
- /* .get_name = */ ggml_backend_cuda_name,
- /* .free = */ ggml_backend_cuda_free,
- /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
- /* .get_alignment = */ ggml_backend_cuda_get_alignment,
- /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
- /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
- /* .synchronize = */ ggml_backend_cuda_synchronize,
- /* .cpy_tensor_from = */ nullptr,
- /* .cpy_tensor_to = */ nullptr,
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
- /* .graph_compute = */ ggml_backend_cuda_graph_compute,
- /* .supports_op = */ nullptr,
+ /* .get_name = */ ggml_backend_cuda_name,
+ /* .free = */ ggml_backend_cuda_free,
+ /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
+ /* .cpy_tensor_from_async = */ NULL,
+ /* .cpy_tensor_to_async = */ NULL,
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
+ /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
+ /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
+ /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
+ /* .supports_op = */ ggml_backend_cuda_supports_op,
  };

- ggml_backend_t ggml_backend_cuda_init() {
+ ggml_backend_t ggml_backend_cuda_init(int device) {
  ggml_init_cublas(); // TODO: remove from ggml.c

- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+ if (device < 0 || device >= ggml_cuda_get_device_count()) {
+ fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+ return nullptr;
+ }
+
+ // not strictly necessary, but it may reduce the overhead of the first graph_compute
+ ggml_cuda_set_main_device(device);
+
+ ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+ /* .device = */ device
+ };

  ggml_backend_t cuda_backend = new ggml_backend {
  /* .interface = */ cuda_backend_i,
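For orientation, a loose sketch of how the per-device entry point shown in this hunk might be exercised (not part of the diff): ggml_backend_cuda_init(device) now validates the index and returns nullptr on failure, while ggml_backend_get_default_buffer_type and ggml_backend_free are assumed to be the generic wrappers declared in ggml-backend.h of this release.

    // sketch only: create a backend for device 0 and query its default buffer type
    #include "ggml-backend.h"
    #include "ggml-cuda.h"
    #include <cstdio>

    int main() {
        ggml_backend_t backend = ggml_backend_cuda_init(0);
        if (backend == nullptr) {   // out-of-range device index or no CUDA device
            std::fprintf(stderr, "failed to initialize CUDA backend for device 0\n");
            return 1;
        }

        // the default buffer type is the buffer type of the backend's own device
        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
        (void) buft;

        ggml_backend_free(backend);
        return 0;
    }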
@@ -8489,3 +9848,27 @@ ggml_backend_t ggml_backend_cuda_init() {
  return cuda_backend;
  }
+
+ bool ggml_backend_is_cuda(ggml_backend_t backend) {
+ return backend->iface.get_name == ggml_backend_cuda_name;
+ }
+
+ static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+ ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
+ return cuda_backend;
+
+ UNUSED(params);
+ }
+
+ extern "C" int ggml_backend_cuda_reg_devices();
+
+ int ggml_backend_cuda_reg_devices() {
+ int device_count = ggml_cuda_get_device_count();
+ //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
+ for (int i = 0; i < device_count; i++) {
+ char name[128];
+ snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
+ ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+ }
+ return device_count;
+ }
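The registration hook above registers one backend entry per visible device (named "CUDA0", "CUDA1", and so on) and returns the device count. A hedged sketch of iterating over the devices directly and creating a backend for each (illustrative only; it assumes ggml_cuda_get_device_count, used above in this file, is also declared in ggml-cuda.h):

    // sketch only: create one backend per visible CUDA device
    #include "ggml-backend.h"
    #include "ggml-cuda.h"
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_devices = ggml_cuda_get_device_count();

        std::vector<ggml_backend_t> backends;
        for (int i = 0; i < n_devices; i++) {
            ggml_backend_t b = ggml_backend_cuda_init(i);   // per-device init, as above
            if (b != nullptr) {
                backends.push_back(b);
            }
        }
        std::printf("initialized %zu CUDA backend(s)\n", backends.size());

        for (ggml_backend_t b : backends) {
            ggml_backend_free(b);
        }
        return 0;
    }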