llama_cpp 0.9.5 → 0.10.0

@@ -1,7 +1,8 @@
  #include <algorithm>
- #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
+ #include <cinttypes>
+ #include <float.h>
  #include <limits>
  #include <stdint.h>
  #include <stdio.h>
@@ -69,6 +70,7 @@
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
  #define cudaSetDevice hipSetDevice
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamFireAndForget hipStreamFireAndForget
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
@@ -190,7 +192,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
  cudaGetErrorString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"CUDA error"); \
  } \
  } while (0)

@@ -204,7 +206,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #else
@@ -216,7 +218,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #endif // CUDART_VERSION >= 11
@@ -433,8 +435,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define WARP_SIZE 32
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

- #define CUDA_ADD_BLOCK_SIZE 256
- #define CUDA_MUL_BLOCK_SIZE 256
  #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
  #define CUDA_RELU_BLOCK_SIZE 256
@@ -527,40 +527,87 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
  return x;
  }

- static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+ return b;
+ }

- if (i >= kx) {
- return;
- }
- dst[i] = x[i] + y[i%ky];
+ static __device__ __forceinline__ float op_add(const float a, const float b) {
+ return a + b;
  }

- static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ static __device__ __forceinline__ float op_mul(const float a, const float b) {
+ return a * b;
+ }

- if (i >= k) {
- return;
- }
- dst[i] = __hadd(x[i], __float2half(y[i]));
+ static __device__ __forceinline__ float op_div(const float a, const float b) {
+ return a / b;
  }

- static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+ int ne0, int ne1, int ne2, int ne3,
+ int ne10, int ne11, int ne12, int ne13,
+ /*int s0, */ int s1, int s2, int s3,
+ /*int s10,*/ int s11, int s12, int s13) {
+ const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+ const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+ const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+ const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;

- if (i >= k) {
+ if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
  return;
  }
- dst[i] = __half2float(x[i]) + y[i];
+
+ const int i11 = i1 % ne11;
+ const int i12 = i2 % ne12;
+ const int i13 = i3 % ne13;
+
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+ const size_t i_dst = i_src0;
+
+ const src0_t * src0_row = src0 + i_src0;
+ const src1_t * src1_row = src1 + i_src1;
+ dst_t * dst_row = dst + i_dst;
+
+ for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+ const int i10 = i0 % ne10;
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+ }
  }

- static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+ int ne0, int ne1, int ne2, int ne3,
+ int ne10, int ne11, int ne12, int ne13,
+ /*int s0, */ int s1, int s2, int s3,
+ /*int s10,*/ int s11, int s12, int s13) {
+
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

- if (i >= kx) {
+ const int i3 = i/(ne2*ne1*ne0);
+ const int i2 = (i/(ne1*ne0)) % ne2;
+ const int i1 = (i/ne0) % ne1;
+ const int i0 = i % ne0;
+
+ if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
  return;
  }
- dst[i] = x[i] * y[i%ky];
+
+ const int i11 = i1 % ne11;
+ const int i12 = i2 % ne12;
+ const int i13 = i3 % ne13;
+
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+ const size_t i_dst = i_src0;
+
+ const src0_t * src0_row = src0 + i_src0;
+ const src1_t * src1_row = src1 + i_src1;
+ dst_t * dst_row = dst + i_dst;
+
+ const int i10 = i0 % ne10;
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
  }

  static __global__ void gelu_f32(const float * x, float * dst, const int k) {
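
The fixed add_f32/add_f16_*/mul_f32 kernels above are replaced by per-element op_* functions and a single templated broadcast kernel; src1 indices simply wrap with a modulo in every dimension. A minimal host-side C++ sketch of that broadcast rule for contiguous tensors (bcast_ref and the flat layout are assumptions for illustration, not part of the upstream change):

    // CPU reference of the indexing used by k_bin_bcast:
    // dst[i3][i2][i1][i0] = op(src0[i3][i2][i1][i0], src1[i3%ne13][i2%ne12][i1%ne11][i0%ne10])
    static void bcast_ref(const float * src0, const float * src1, float * dst,
                          int ne0, int ne1, int ne2, int ne3,
                          int ne10, int ne11, int ne12, int ne13,
                          float (*op)(float, float)) {
        for (int i3 = 0; i3 < ne3; ++i3)
        for (int i2 = 0; i2 < ne2; ++i2)
        for (int i1 = 0; i1 < ne1; ++i1)
        for (int i0 = 0; i0 < ne0; ++i0) {
            const int s = (((i3 % ne13)*ne12 + i2 % ne12)*ne11 + i1 % ne11)*ne10 + i0 % ne10;
            const int d = ((i3*ne2 + i2)*ne1 + i1)*ne0 + i0;
            dst[d] = op(src0[d], src1[s]);
        }
    }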
@@ -604,12 +651,10 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  }

  template <int block_size>
- static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+ static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

- const float eps = 1e-5f;
-
  float2 mean_var = make_float2(0.f, 0.f);

  for (int col = tid; col < ncols; col += block_size) {
@@ -4559,6 +4604,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  cpy_1(cx + x_offset, cdst + dst_offset);
  }

+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+ const float * xi = (const float *) cxi;
+ block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+ float amax = 0.0f; // absolute max
+
+ for (int j = 0; j < QK8_0; j++) {
+ const float v = xi[j];
+ amax = fmaxf(amax, fabsf(v));
+ }
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ dsti->d = d;
+
+ for (int j = 0; j < QK8_0; ++j) {
+ const float x0 = xi[j]*id;
+
+ dsti->qs[j] = roundf(x0);
+ }
+ }
+
+ static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+ const float * xi = (const float *) cxi;
+ block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+ float amax = 0.0f;
+ float vmax = 0.0f;
+
+ for (int j = 0; j < QK4_0; ++j) {
+ const float v = xi[j];
+ if (amax < fabsf(v)) {
+ amax = fabsf(v);
+ vmax = v;
+ }
+ }
+
+ const float d = vmax / -8;
+ const float id = d ? 1.0f/d : 0.0f;
+
+ dsti->d = d;
+
+ for (int j = 0; j < QK4_0/2; ++j) {
+ const float x0 = xi[0 + j]*id;
+ const float x1 = xi[QK4_0/2 + j]*id;
+
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+ dsti->qs[j] = xi0;
+ dsti->qs[j] |= xi1 << 4;
+ }
+ }
+
+ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+ const float * xi = (const float *) cxi;
+ block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+ float vmin = FLT_MAX;
+ float vmax = -FLT_MAX;
+
+ for (int j = 0; j < QK4_1; ++j) {
+ const float v = xi[j];
+
+ if (v < vmin) vmin = v;
+ if (v > vmax) vmax = v;
+ }
+
+ const float d = (vmax - vmin) / ((1 << 4) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ dsti->dm.x = d;
+ dsti->dm.y = vmin;
+
+ for (int j = 0; j < QK4_1/2; ++j) {
+ const float x0 = (xi[0 + j] - vmin)*id;
+ const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+ dsti->qs[j] = xi0;
+ dsti->qs[j] |= xi1 << 4;
+ }
+ }
+
+ template <cpy_kernel_t cpy_blck, int qk>
+ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+ const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+ if (i >= ne) {
+ return;
+ }
+
+ const int i02 = i / (ne00*ne01);
+ const int i01 = (i - i02*ne01*ne00) / ne00;
+ const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+ const int i12 = i / (ne10*ne11);
+ const int i11 = (i - i12*ne10*ne11) / ne10;
+ const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+ cpy_blck(cx + x_offset, cdst + dst_offset);
+ }
+
  static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
  const float y = (i0 / 2 - low) / max(0.001f, high - low);
  return 1.0f - min(1.0f, max(0.0f, y));
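
cpy_blck_f32_q8_0 above quantizes one block of QK8_0 floats with a per-block scale d = amax/127 and rounded 8-bit values. A plain C reference of the same scheme (q8_block is a simplified stand-in for block_q8_0, whose real scale is stored as fp16):

    #include <math.h>
    #include <stdint.h>

    #define QK8_0 32

    typedef struct { float d; int8_t qs[QK8_0]; } q8_block; // simplified block layout

    static void quantize_block_q8_0_ref(const float * x, q8_block * out) {
        float amax = 0.0f;                          // absolute max of the block
        for (int j = 0; j < QK8_0; ++j) {
            amax = fmaxf(amax, fabsf(x[j]));
        }
        const float d  = amax / 127.0f;             // same as amax / ((1 << 7) - 1)
        const float id = d != 0.0f ? 1.0f/d : 0.0f;
        out->d = d;
        for (int j = 0; j < QK8_0; ++j) {
            out->qs[j] = (int8_t) roundf(x[j]*id);  // values fall in [-127, 127]
        }
    }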
@@ -4713,6 +4868,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
  dst[i] = col * m_k + x[i];
  }

+ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+ const int row = blockIdx.y;
+ const int col = threadIdx.x;
+
+ float sum = 0.0f;
+ for (int i = col; i < ncols; i += blockDim.x) {
+ sum += x[row * ncols + i];
+ }
+
+ sum = warp_reduce_sum(sum);
+
+ if (col == 0) {
+ dst[row] = sum;
+ }
+ }
+
+ template<typename T>
+ static inline __device__ void swap(T & a, T & b) {
+ T tmp = a;
+ a = b;
+ b = tmp;
+ }
+
+ template<ggml_sort_order order>
+ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
+ // bitonic sort
+ int col = threadIdx.x;
+ int row = blockIdx.y;
+
+ if (col >= ncols) return;
+
+ const float * x_row = x + row * ncols;
+ int * dst_row = dst + row * ncols;
+
+ // initialize indices
+ if (col < ncols) {
+ dst_row[col] = col;
+ }
+ __syncthreads();
+
+ for (int k = 2; k <= ncols; k *= 2) {
+ for (int j = k / 2; j > 0; j /= 2) {
+ int ixj = col ^ j;
+ if (ixj > col) {
+ if ((col & k) == 0) {
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+ swap(dst_row[col], dst_row[ixj]);
+ }
+ } else {
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+ swap(dst_row[col], dst_row[ixj]);
+ }
+ }
+ }
+ __syncthreads();
+ }
+ }
+ }
+
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
  const int col = blockDim.y*blockIdx.y + threadIdx.y;
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
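
k_argsort_f32_i32 above sorts row indices in place with a bitonic network, so each row must have a power-of-two number of columns (the launcher added further down asserts this). A host-side reference that yields the same ordering, useful for checking the kernel (illustrative only):

    #include <algorithm>
    #include <numeric>
    #include <vector>

    // Reference argsort of one row; ascending == true corresponds to GGML_SORT_ASC.
    static std::vector<int> argsort_row_ref(const float * row, int ncols, bool ascending) {
        std::vector<int> idx(ncols);
        std::iota(idx.begin(), idx.end(), 0);       // 0, 1, ..., ncols-1
        std::sort(idx.begin(), idx.end(), [&](int a, int b) {
            return ascending ? row[a] < row[b] : row[a] > row[b];
        });
        return idx;
    }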
@@ -4722,8 +4936,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  }

  const int i = row*ncols + col;
- // dst[i] = col > n_past + row ? -INFINITY : x[i];
- dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+ //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
  }

  static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
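
The masked positions are now pushed down by FLT_MAX instead of INT_MAX, so they become hugely negative floats before soft_max_f32 and effectively vanish after exponentiation. A tiny standalone check of the expression (illustrative values):

    #include <cfloat>
    #include <cstdio>

    int main() {
        const float x      = 1.5f; // an unmasked logit value, chosen arbitrarily
        const int   masked = 1;    // (col > n_past + row % rows_per_channel) evaluated to true
        std::printf("%g\n", x - masked * FLT_MAX); // prints roughly -3.40282e+38
        return 0;
    }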
@@ -4845,25 +5060,119 @@ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const
  k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
  }

- static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
- const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
- }
-
- static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
- }
-
- static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
- }
+ template<float (*bin_op)(const float, const float)>
+ struct bin_bcast_cuda {
+ template<typename src0_t, typename src1_t, typename dst_t>
+ void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+ const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+ cudaStream_t stream) {
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+
+ int nr0 = ne10/ne0;
+ int nr1 = ne11/ne1;
+ int nr2 = ne12/ne2;
+ int nr3 = ne13/ne3;
+
+ int nr[4] = { nr0, nr1, nr2, nr3 };
+
+ // collapse dimensions until first broadcast dimension
+ int64_t cne0[] = {ne0, ne1, ne2, ne3};
+ int64_t cne1[] = {ne10, ne11, ne12, ne13};
+ size_t cnb0[] = {nb0, nb1, nb2, nb3};
+ size_t cnb1[] = {nb10, nb11, nb12, nb13};
+ auto collapse = [](int64_t cne[]) {
+ cne[0] *= cne[1];
+ cne[1] = cne[2];
+ cne[2] = cne[3];
+ cne[3] = 1;
+ };
+
+ auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+ cnb[1] *= cne[1];
+ cnb[2] *= cne[2];
+ cnb[3] *= cne[3];
+ };
+
+ for (int i = 0; i < 4; i++) {
+ if (nr[i] != 1) {
+ break;
+ }
+ if (i > 0) {
+ collapse_nb(cnb0, cne0);
+ collapse_nb(cnb1, cne1);
+ collapse(cne0);
+ collapse(cne1);
+ }
+ }
+ {
+ int64_t ne0 = cne0[0];
+ int64_t ne1 = cne0[1];
+ int64_t ne2 = cne0[2];
+ int64_t ne3 = cne0[3];
+
+ int64_t ne10 = cne1[0];
+ int64_t ne11 = cne1[1];
+ int64_t ne12 = cne1[2];
+ int64_t ne13 = cne1[3];
+
+ //size_t nb0 = cnb0[0];
+ size_t nb1 = cnb0[1];
+ size_t nb2 = cnb0[2];
+ size_t nb3 = cnb0[3];
+
+ //size_t nb10 = cnb1[0];
+ size_t nb11 = cnb1[1];
+ size_t nb12 = cnb1[2];
+ size_t nb13 = cnb1[3];
+
+ //size_t s0 = nb0 / sizeof(src1_t);
+ size_t s1 = nb1 / sizeof(src1_t);
+ size_t s2 = nb2 / sizeof(src1_t);
+ size_t s3 = nb3 / sizeof(src1_t);
+
+ //size_t s10 = nb10 / sizeof(src1_t);
+ size_t s11 = nb11 / sizeof(src1_t);
+ size_t s12 = nb12 / sizeof(src1_t);
+ size_t s13 = nb13 / sizeof(src1_t);
+
+
+ const int block_size = 128;
+
+ int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+ dim3 block_dims;
+ block_dims.x = std::min<unsigned int>(hne0, block_size);
+ block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+ block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+ dim3 block_nums(
+ (hne0 + block_dims.x - 1) / block_dims.x,
+ (ne1 + block_dims.y - 1) / block_dims.y,
+ (ne2*ne3 + block_dims.z - 1) / block_dims.z
+ );

- static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
- const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
- mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
- }
+ if (block_nums.z > 65535) {
+ // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+ int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+ k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+ src0_dd, src1_dd, dst_dd,
+ ne0, ne1, ne2, ne3,
+ ne10, ne11, ne12, ne13,
+ /* s0, */ s1, s2, s3,
+ /* s10, */ s11, s12, s13);
+ } else {
+ k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+ src0_dd, src1_dd, dst_dd,
+ ne0, ne1, ne2, ne3,
+ ne10, ne11, ne12, ne13,
+ /* s0, */ s1, s2, s3,
+ /* s10, */ s11, s12, s13);
+ }
+ }
+ }
+ };

  static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
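
bin_bcast_cuda first folds leading dimensions that are not broadcast into their neighbour, so the kernel is launched over as few logical dimensions as possible; the collapse lambda merges dim 1 into dim 0 and shifts the rest down. A minimal standalone sketch of that folding (the example extents are assumptions):

    #include <cstdint>
    #include <cstdio>

    int main() {
        int64_t cne[4] = {32, 8, 4, 2}; // assumed example extents ne0..ne3
        // same operation as the collapse lambda in bin_bcast_cuda
        cne[0] *= cne[1];
        cne[1]  = cne[2];
        cne[2]  = cne[3];
        cne[3]  = 1;
        std::printf("%lld %lld %lld %lld\n",
                    (long long)cne[0], (long long)cne[1], (long long)cne[2], (long long)cne[3]); // 256 4 2 1
        return 0;
    }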
@@ -4885,14 +5194,14 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
  sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
  }

- static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
  if (ncols < 1024) {
  const dim3 block_dims(WARP_SIZE, 1, 1);
- norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
  } else {
  const dim3 block_dims(1024, 1, 1);
- norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
  }
  }

@@ -4914,34 +5223,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
  }

- template<typename dst_t>
- static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+ dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

  template<typename dst_t>
@@ -4990,6 +5275,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
  #endif
  }

+ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0:
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ case GGML_TYPE_Q4_1:
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ case GGML_TYPE_Q5_0:
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+ case GGML_TYPE_Q5_1:
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+ case GGML_TYPE_Q8_0:
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+ case GGML_TYPE_Q2_K:
+ return dequantize_row_q2_K_cuda;
+ case GGML_TYPE_Q3_K:
+ return dequantize_row_q3_K_cuda;
+ case GGML_TYPE_Q4_K:
+ return dequantize_row_q4_K_cuda;
+ case GGML_TYPE_Q5_K:
+ return dequantize_row_q5_K_cuda;
+ case GGML_TYPE_Q6_K:
+ return dequantize_row_q6_K_cuda;
+ case GGML_TYPE_F32:
+ return dequantize_block_cuda<1, 1, convert_f32>;
+ default:
+ return nullptr;
+ }
+ }
+
+ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0:
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ case GGML_TYPE_Q4_1:
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ case GGML_TYPE_Q5_0:
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+ case GGML_TYPE_Q5_1:
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+ case GGML_TYPE_Q8_0:
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+ case GGML_TYPE_Q2_K:
+ return dequantize_row_q2_K_cuda;
+ case GGML_TYPE_Q3_K:
+ return dequantize_row_q3_K_cuda;
+ case GGML_TYPE_Q4_K:
+ return dequantize_row_q4_K_cuda;
+ case GGML_TYPE_Q5_K:
+ return dequantize_row_q5_K_cuda;
+ case GGML_TYPE_Q6_K:
+ return dequantize_row_q6_K_cuda;
+ case GGML_TYPE_F16:
+ return dequantize_block_cuda<1, 1, convert_f16>;
+ default:
+ return nullptr;
+ }
+ }
+
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5078,6 +5421,15 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(block_num_y, 1, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ dequantize_mul_mat_vec<1, 1, convert_f16>
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK4_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5168,83 +5520,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

- static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
- dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(block_num_y, 1, 1);
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- dequantize_mul_mat_vec<1, 1, convert_f16>
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
- }
-
- static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_row_q4_0_cuda;
- case GGML_TYPE_Q4_1:
- return dequantize_row_q4_1_cuda;
- case GGML_TYPE_Q5_0:
- return dequantize_row_q5_0_cuda;
- case GGML_TYPE_Q5_1:
- return dequantize_row_q5_1_cuda;
- case GGML_TYPE_Q8_0:
- return dequantize_row_q8_0_cuda;
- case GGML_TYPE_Q2_K:
- return dequantize_row_q2_K_cuda;
- case GGML_TYPE_Q3_K:
- return dequantize_row_q3_K_cuda;
- case GGML_TYPE_Q4_K:
- return dequantize_row_q4_K_cuda;
- case GGML_TYPE_Q5_K:
- return dequantize_row_q5_K_cuda;
- case GGML_TYPE_Q6_K:
- return dequantize_row_q6_K_cuda;
- case GGML_TYPE_F32:
- return convert_fp32_to_fp16_cuda;
- default:
- return nullptr;
- }
- }
-
- static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_row_q4_0_cuda;
- case GGML_TYPE_Q4_1:
- return dequantize_row_q4_1_cuda;
- case GGML_TYPE_Q5_0:
- return dequantize_row_q5_0_cuda;
- case GGML_TYPE_Q5_1:
- return dequantize_row_q5_1_cuda;
- case GGML_TYPE_Q8_0:
- return dequantize_row_q8_0_cuda;
- case GGML_TYPE_Q2_K:
- return dequantize_row_q2_K_cuda;
- case GGML_TYPE_Q3_K:
- return dequantize_row_q3_K_cuda;
- case GGML_TYPE_Q4_K:
- return dequantize_row_q4_K_cuda;
- case GGML_TYPE_Q5_K:
- return dequantize_row_q5_K_cuda;
- case GGML_TYPE_Q6_K:
- return dequantize_row_q6_K_cuda;
- case GGML_TYPE_F16:
- return convert_fp16_to_fp32_cuda;
- default:
- return nullptr;
- }
- }
-
  static void ggml_mul_mat_q4_0_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -5737,19 +6012,52 @@ static void ggml_cpy_f32_f16_cuda(
  (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
  }

- static void ggml_cpy_f16_f16_cuda(
+ static void ggml_cpy_f32_q8_0_cuda(
  const char * cx, char * cdst, const int ne,
  const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
  const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {

- const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
- cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ GGML_ASSERT(ne % QK8_0 == 0);
+ const int num_blocks = ne / QK8_0;
+ cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
  (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
  }

- static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
- scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
+ static void ggml_cpy_f32_q4_0_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ GGML_ASSERT(ne % QK4_0 == 0);
+ const int num_blocks = ne / QK4_0;
+ cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
+ static void ggml_cpy_f32_q4_1_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ GGML_ASSERT(ne % QK4_1 == 0);
+ const int num_blocks = ne / QK4_1;
+ cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
+ static void ggml_cpy_f16_f16_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
+ static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+ scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
  }

  static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
@@ -5823,6 +6131,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
  alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
  }

+ static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ const dim3 block_nums(1, nrows, 1);
+ k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+ }
+
+ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+ // bitonic sort requires ncols to be power of 2
+ GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+ const dim3 block_dims(ncols, 1, 1);
+ const dim3 block_nums(1, nrows, 1);
+ if (order == GGML_SORT_ASC) {
+ k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+ } else if (order == GGML_SORT_DESC) {
+ k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+ } else {
+ GGML_ASSERT(false);
+ }
+ }
+
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
  const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -5915,7 +6244,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
  return ptr;
  }
  #ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
  (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
  #endif
  void * ptr;
@@ -6053,7 +6382,7 @@ void * ggml_cuda_host_malloc(size_t size) {
  // The allocation error can be bypassed. A null ptr will assigned out of this function.
  // This can fixed the OOM error in WSL.
  cudaGetLastError();
- fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
+ fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
  size/1024.0/1024.0, cudaGetErrorString(err));
  return nullptr;
  }
@@ -6098,75 +6427,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
  if (nb0 == ts && nb1 == ts*ne0/bs) {
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
- }
- if (nb0 == ts) {
+ } else if (nb0 == ts) {
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
- }
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
- const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
- // pretend the row is a matrix with cols=1
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
- if (r != cudaSuccess) { return r; }
- }
- return cudaSuccess;
- }
-
- static void ggml_cuda_op_repeat(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
- // guaranteed to be an integer due to the check in ggml_can_repeat
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
- const int64_t ne3 = dst->ne[3];
-
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const int nr0 = (int)(ne0/ne00);
- const int nr1 = (int)(ne1/ne01);
- const int nr2 = (int)(ne2/ne02);
- const int nr3 = (int)(ne3/ne03);
-
- // TODO: support for transposed / permuted tensors
- GGML_ASSERT(nb0 == sizeof(float));
- GGML_ASSERT(nb00 == sizeof(float));
-
- // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
- for (int i3 = 0; i3 < nr3; i3++) {
- for (int k3 = 0; k3 < ne03; k3++) {
- for (int i2 = 0; i2 < nr2; i2++) {
- for (int k2 = 0; k2 < ne02; k2++) {
- for (int i1 = 0; i1 < nr1; i1++) {
- for (int k1 = 0; k1 < ne01; k1++) {
- for (int i0 = 0; i0 < nr0; i0++) {
- CUDA_CHECK(cudaMemcpyAsync(
- (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
- (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
- ne00*nb0, cudaMemcpyDeviceToDevice, stream));
- }
- }
- }
- }
- }
+ } else {
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+ // pretend the row is a matrix with cols=1
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+ if (r != cudaSuccess) return r;
  }
+ return cudaSuccess;
  }
-
- (void) src1;
- (void) src1_d;
  }

  static void ggml_cuda_op_get_rows(
@@ -6213,44 +6485,55 @@ static void ggml_cuda_op_get_rows(
  }
  }

- inline void ggml_cuda_op_add(
+ template<class op>
+ inline void ggml_cuda_op_bin_bcast(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

  GGML_ASSERT(src1->type == GGML_TYPE_F32);

- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
-
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
+ op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
- add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
- add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
  } else {
- fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
+ fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+ ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
  GGML_ASSERT(false);
  }
+ }
+
+ static void ggml_cuda_op_repeat(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
+
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);

  (void) src1;
- (void) dst;
+ (void) src1_d;
  }

- inline void ggml_cuda_op_mul(
+ inline void ggml_cuda_op_add(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+ }

- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
+ inline void ggml_cuda_op_mul(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+ }

- (void) dst;
+ inline void ggml_cuda_op_div(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
  }

  inline void ggml_cuda_op_gelu(
@@ -6319,7 +6602,10 @@ inline void ggml_cuda_op_norm(
  const int64_t ne00 = src0->ne[0];
  const int64_t nrows = ggml_nrows(src0);

- norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

  (void) src1;
  (void) dst;
@@ -6474,6 +6760,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
  const int64_t src1_padded_row_size, const cudaStream_t & stream) {

+ GGML_ASSERT(ggml_nrows(src1) == 1);
+
  const int64_t ne00 = src0->ne[0];
  const int64_t row_diff = row_high - row_low;

@@ -6533,7 +6821,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
  size_t ash;
  dfloat * src1_dfloat = nullptr; // dfloat == half

- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+ bool src1_convert_f16 =
+ src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
  src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
  src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

@@ -6859,6 +7148,42 @@ inline void ggml_cuda_op_im2col(
  (void) src0_dd;
  }

+ inline void ggml_cuda_op_sum_rows(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ const int64_t ncols = src0->ne[0];
+ const int64_t nrows = ggml_nrows(src0);
+
+ sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+ }
+
+ inline void ggml_cuda_op_argsort(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+ const int64_t ncols = src0->ne[0];
+ const int64_t nrows = ggml_nrows(src0);
+
+ enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+ argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+ }
+
  inline void ggml_cuda_op_diag_mask_inf(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7067,7 +7392,7 @@ static void ggml_cuda_op_mul_mat(
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
- // const int64_t nrows0 = ggml_nrows(src0);
+ const int64_t nrows0 = ggml_nrows(src0);

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
@@ -7103,10 +7428,9 @@ static void ggml_cuda_op_mul_mat(

  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
-
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
- const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
+
+ const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
  GGML_ASSERT(!(split && ne02 > 1));
@@ -7231,7 +7555,7 @@ static void ggml_cuda_op_mul_mat(
  const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;

  // for split tensors the data begins at i0 == i0_offset_low
- char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
  float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
  char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
  float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@@ -7376,6 +7700,10 @@ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, gg
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
  }

+ static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
+ }
+
  static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
  }
@@ -7401,7 +7729,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  }

  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (!g_cublas_loaded) { return false; }
+ if (!g_cublas_loaded) return false;

  const int64_t ne10 = src1->ne[0];

@@ -7479,7 +7807,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

- __global__ static void k_compute_batched_ptrs(
+ static __global__ void k_compute_batched_ptrs(
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
  const void ** ptrs_src, void ** ptrs_dst,
  int ne12, int ne13,
@@ -7535,9 +7863,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

- int id;
- CUDA_CHECK(cudaGetDevice(&id));
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));

  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -7594,7 +7920,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
  // use cublasGemmStridedBatchedEx
  CUBLAS_CHECK(
- cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
  &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
  (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
@@ -7628,7 +7954,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  CUDA_CHECK(cudaGetLastError());

  CUBLAS_CHECK(
- cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
  &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
  (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
@@ -7698,10 +8024,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  #ifdef GGML_CUDA_FORCE_DMMV
  const bool use_mul_mat_vec_q = false;
  #else
- const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
  #endif // GGML_CUDA_FORCE_DMMV

  if (use_mul_mat_vec_q) {
+ // NOTE: this kernel does not support ggml_nrows(src1) > 1
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
  } else {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
@@ -7726,6 +8053,219 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7726
8053
  }
7727
8054
  }
7728
8055
 
8056
+ #if 0
8057
+ template<typename ... Srcs>
8058
+ static __global__ void k_compute_batched_ptrs_id(
8059
+ const void ** ptrs_src, void ** ptrs_dst,
8060
+ int ne12, int ne13,
8061
+ int ne23,
8062
+ int nb02, int nb03,
8063
+ int nb12, int nb13,
8064
+ int nb2, int nb3,
8065
+ int r2, int r3,
8066
+ ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
8067
+ const half * src1_f16, half * dst_f16,
8068
+ const int32_t * ids, const int id,
8069
+ Srcs... src0s) {
8070
+
8071
+ int i = ids[id];
8072
+
8073
+ half * src0_f16;
8074
+ const void * srcs_ar[] = { (const half *) src0s... };
8075
+ if (src0_type == GGML_TYPE_F16) {
8076
+ src0_f16 = (half *) srcs_ar[i];
8077
+ } else {
8078
+ src0_f16 = src0_as_f16;
8079
+ if (threadIdx.x == 0 && threadIdx.y == 0) {
8080
+ const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
8081
+ to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
8082
+ }
8083
+ }
8084
+
8085
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
8086
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
8087
+
8088
+ if (i13 >= ne13 || i12 >= ne12) {
8089
+ return;
8090
+ }
8091
+
8092
+ int i03 = i13 / r3;
8093
+ int i02 = i12 / r2;
8094
+
8095
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
8096
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
8097
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
8098
+ }
8099
+
8100
+ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
8101
+ const struct ggml_tensor * ids = dst->src[0];
8102
+ const struct ggml_tensor * src1 = dst->src[1];
8103
+ const struct ggml_tensor * src00 = dst->src[2];
8104
+
8105
+ const int id = dst->op_params[0];
8106
+
8107
+ GGML_ASSERT(!ggml_is_transposed(src00));
8108
+ GGML_ASSERT(!ggml_is_transposed(src1));
8109
+
8110
+ GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
8111
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
8112
+
8113
+ const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
8114
+ const int64_t ne01 = src00->ne[1];
8115
+ const int64_t ne02 = src00->ne[2];
8116
+ const int64_t ne03 = src00->ne[3];
8117
+
8118
+ //const int64_t nb01 = src00->nb[1];
8119
+ const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
8120
+ const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
8121
+
8122
+ const int64_t ne10 = src1->ne[0];
8123
+ const int64_t ne11 = src1->ne[1];
8124
+ const int64_t ne12 = src1->ne[2];
8125
+ const int64_t ne13 = src1->ne[3];
8126
+
8127
+ //const int64_t nb11 = src1->nb[1];
8128
+ const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
8129
+ const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
8130
+
8131
+ const int64_t ne1 = ggml_nelements(src1);
8132
+ const int64_t ne = ggml_nelements(dst);
8133
+
8134
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8135
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8136
+
8137
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
8138
+
8139
+ //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
8140
+ //void * src0_ddq = src0_extra->data_device[g_main_device];
8141
+ //half * src0_as_f16 = (half *) src0_ddq;
8142
+
8143
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
8144
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
8145
+
8146
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
8147
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
8148
+
8149
+ // convert src1 to fp16
8150
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
8151
+ GGML_ASSERT(to_fp16_cuda != nullptr);
8152
+
8153
+ size_t src1_as = 0;
8154
+ half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
8155
+ to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
8156
+
8157
+ size_t dst_as = 0;
8158
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8159
+
8160
+ GGML_ASSERT(ne12 % ne02 == 0);
8161
+ GGML_ASSERT(ne13 % ne03 == 0);
8162
+
8163
+ // broadcast factors
8164
+ const int64_t r2 = ne12/ne02;
8165
+ const int64_t r3 = ne13/ne03;
8166
+
8167
+ const half alpha_f16 = 1.0f;
8168
+ const half beta_f16 = 0.0f;
8169
+
8170
+ // use cublasGemmBatchedEx
8171
+ const int ne23 = ne12*ne13;
8172
+
8173
+ const void ** ptrs_src = nullptr;
8174
+ void ** ptrs_dst = nullptr;
8175
+
8176
+ size_t ptrs_src_s = 0;
8177
+ size_t ptrs_dst_s = 0;
8178
+
8179
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
8180
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
8181
+
8182
+ int64_t src0_ne = ggml_nelements(src00);
8183
+ half * src0_as_f16 = nullptr;
8184
+ size_t src0_as = 0;
8185
+ if (src00->type != GGML_TYPE_F16) {
8186
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
8187
+ }
8188
+
8189
+ static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
8190
+ dim3 block_dims(ne13, ne12);
8191
+ k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
8192
+ ptrs_src, ptrs_dst,
8193
+ ne12, ne13,
8194
+ ne23,
8195
+ ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
8196
+ nb12, nb13,
8197
+ dst->nb[2], dst->nb[3],
8198
+ r2, r3,
8199
+ src00->type, src0_as_f16, src0_ne,
8200
+ src1_as_f16, dst_f16,
8201
+ (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
8202
+ dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
8203
+ dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
8204
+ dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
8205
+ dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
8206
+ );
8207
+ CUDA_CHECK(cudaGetLastError());
8208
+
8209
+ CUBLAS_CHECK(
8210
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8211
+ ne01, ne11, ne10,
8212
+ &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
8213
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
8214
+ &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
8215
+ ne23,
8216
+ CUBLAS_COMPUTE_16F,
8217
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8218
+
8219
+ if (src0_as != 0) {
8220
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
8221
+ }
8222
+ if (ptrs_src_s != 0) {
8223
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
8224
+ }
8225
+ if (ptrs_dst_s != 0) {
8226
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
8227
+ }
8228
+
8229
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8230
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8231
+
8232
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
8233
+ ggml_cuda_pool_free(dst_f16, dst_as);
8234
+ }
8235
+ #endif
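A minimal host-side sketch of the broadcast indexing that the pointer-array kernel above prepares for cublasGemmBatchedEx; the variable names mirror the diff, but the enumerating function itself is hypothetical and for illustration only:

#include <cstdint>

// each of the ne23 = ne12*ne13 GEMMs pairs dst/src1 batch (i12, i13) with
// src0 slice (i12/r2, i13/r3), where r2 = ne12/ne02 and r3 = ne13/ne03
static void enumerate_batched_gemms(int64_t ne02, int64_t ne03, int64_t ne12, int64_t ne13) {
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12/r2; // src0 index along dim 2
            const int64_t i03 = i13/r3; // src0 index along dim 3
            // GEMM #(i12 + i13*ne12): dst[i13][i12] = src0[i03][i02]^T * src1[i13][i12]
            (void) i02; (void) i03;
        }
    }
}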
8236
+
8237
+ static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
8238
+ #if 0
8239
+ //#ifdef CUDA_USE_TENSOR_CORES
8240
+ // const bool use_tensor_cores = true;
8241
+ //#else
8242
+ // const bool use_tensor_cores = false;
8243
+ //#endif
8244
+
8245
+ ggml_cuda_mul_mat_id_cublas(dst);
8246
+
8247
+ // TODO: mmq/mmv support
8248
+ #else
8249
+ const struct ggml_tensor * ids = dst->src[0];
8250
+ const struct ggml_tensor * src1 = dst->src[1];
8251
+ const int id = dst->op_params[0];
8252
+
8253
+ int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8254
+
8255
+ int32_t a_id;
8256
+ CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8257
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8258
+
8259
+ GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
8260
+ const struct ggml_tensor * src0 = dst->src[a_id + 2];
8261
+
8262
+ ggml_cuda_mul_mat(src0, src1, dst);
8263
+ #endif
8264
+
8265
+ (void) _src0;
8266
+ (void) _src1;
8267
+ }
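The expert index is fetched with an asynchronous device-to-host copy followed by a stream synchronize; a standalone sketch of that pattern (hypothetical helper, error checking omitted):

#include <cuda_runtime.h>
#include <cstdint>

// copy one int32 from device memory and wait until it is valid on the host
static int32_t read_device_i32(const int32_t * dev_ptr, cudaStream_t stream) {
    int32_t host_val = 0;
    cudaMemcpyAsync(&host_val, dev_ptr, sizeof(int32_t), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream); // the value may only be read after the sync
    return host_val;
}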
8268
+
7729
8269
  static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7730
8270
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
7731
8271
  }
@@ -7770,14 +8310,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7770
8310
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
7771
8311
 
7772
8312
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
7773
- ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7774
- ne10, ne11, nb10, nb11, nb12, main_stream);
8313
+ ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7775
8314
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
7776
- ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7777
- ne10, ne11, nb10, nb11, nb12, main_stream);
8315
+ ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8316
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
8317
+ ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8318
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
8319
+ ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8320
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
8321
+ ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7778
8322
  } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
7779
- ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7780
- ne10, ne11, nb10, nb11, nb12, main_stream);
8323
+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7781
8324
  } else {
7782
8325
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
7783
8326
  ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7788,6 +8331,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7788
8331
  }
7789
8332
 
7790
8333
  static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8334
+ // TODO: why do we pass dst as src1 here?
7791
8335
  ggml_cuda_cpy(src0, dst, nullptr);
7792
8336
  (void) src1;
7793
8337
  }
@@ -7813,6 +8357,16 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
7813
8357
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
7814
8358
  }
7815
8359
 
8360
+ static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8361
+ GGML_ASSERT(ggml_is_contiguous(src0));
8362
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
8363
+ }
8364
+
8365
+ static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8366
+ GGML_ASSERT(ggml_is_contiguous(src0));
8367
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
8368
+ }
8369
+
7816
8370
  static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7817
8371
  (void) src0;
7818
8372
  (void) src1;
@@ -8068,8 +8622,9 @@ void ggml_cuda_set_main_device(const int main_device) {
8068
8622
  main_device, g_device_count, g_main_device);
8069
8623
  return;
8070
8624
  }
8071
- g_main_device = main_device;
8072
- if (g_device_count > 1) {
8625
+
8626
+ if (g_main_device != main_device && g_device_count > 1) {
8627
+ g_main_device = main_device;
8073
8628
  cudaDeviceProp prop;
8074
8629
  CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
8075
8630
  fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
@@ -8095,7 +8650,7 @@ void ggml_cuda_free_scratch() {
8095
8650
  }
8096
8651
 
8097
8652
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
8098
- if (!g_cublas_loaded) { return false; }
8653
+ if (!g_cublas_loaded) return false;
8099
8654
 
8100
8655
  ggml_cuda_func_t func;
8101
8656
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8131,6 +8686,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8131
8686
  case GGML_OP_MUL:
8132
8687
  func = ggml_cuda_mul;
8133
8688
  break;
8689
+ case GGML_OP_DIV:
8690
+ func = ggml_cuda_div;
8691
+ break;
8134
8692
  case GGML_OP_UNARY:
8135
8693
  switch (ggml_get_unary_op(tensor)) {
8136
8694
  case GGML_UNARY_OP_GELU:
@@ -8144,7 +8702,8 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8144
8702
  break;
8145
8703
  default:
8146
8704
  return false;
8147
- } break;
8705
+ }
8706
+ break;
8148
8707
  case GGML_OP_NORM:
8149
8708
  func = ggml_cuda_norm;
8150
8709
  break;
@@ -8157,6 +8716,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8157
8716
  }
8158
8717
  func = ggml_cuda_mul_mat;
8159
8718
  break;
8719
+ case GGML_OP_MUL_MAT_ID:
8720
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
8721
+ return false;
8722
+ }
8723
+ func = ggml_cuda_mul_mat_id;
8724
+ break;
8160
8725
  case GGML_OP_SCALE:
8161
8726
  func = ggml_cuda_scale;
8162
8727
  break;
@@ -8196,6 +8761,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8196
8761
  case GGML_OP_IM2COL:
8197
8762
  func = ggml_cuda_im2col;
8198
8763
  break;
8764
+ case GGML_OP_SUM_ROWS:
8765
+ func = ggml_cuda_sum_rows;
8766
+ break;
8767
+ case GGML_OP_ARGSORT:
8768
+ func = ggml_cuda_argsort;
8769
+ break;
8199
8770
  default:
8200
8771
  return false;
8201
8772
  }
@@ -8212,7 +8783,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8212
8783
 
8213
8784
  int ggml_cuda_get_device_count() {
8214
8785
  int device_count;
8215
- CUDA_CHECK(cudaGetDeviceCount(&device_count));
8786
+ if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
8787
+ return 0;
8788
+ }
8216
8789
  return device_count;
8217
8790
  }
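With this change a missing driver or zero devices no longer aborts inside CUDA_CHECK; a hedged sketch of how a caller might probe and fall back, assuming ggml_cuda_get_device_count is exported through ggml-cuda.h:

#include "ggml-cuda.h"
#include <cstdio>

static bool cuda_usable(void) {
    const int n_devices = ggml_cuda_get_device_count(); // 0 if cudaGetDeviceCount fails
    if (n_devices == 0) {
        fprintf(stderr, "no usable CUDA devices, staying on CPU\n");
    }
    return n_devices > 0;
}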
8218
8791
 
@@ -8228,27 +8801,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
8228
8801
 
8229
8802
  #define UNUSED GGML_UNUSED
8230
8803
 
8231
- struct ggml_backend_context_cuda {
8232
- };
8233
-
8234
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
8235
- return GGML_CUDA_NAME;
8236
-
8237
- UNUSED(backend);
8238
- }
8239
-
8240
- static void ggml_backend_cuda_free(ggml_backend_t backend) {
8241
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
8242
- delete cuda_ctx;
8243
- delete backend;
8244
- }
8804
+ // cuda buffer
8245
8805
 
8246
8806
  struct ggml_backend_buffer_context_cuda {
8247
- void * device;
8248
-
8807
+ int device;
8808
+ void * dev_ptr = nullptr;
8249
8809
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
8250
8810
  size_t temp_tensor_extra_index = 0;
8251
8811
 
8812
+ ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
8813
+
8252
8814
  ~ggml_backend_buffer_context_cuda() {
8253
8815
  delete[] temp_tensor_extras;
8254
8816
  }
@@ -8269,41 +8831,20 @@ struct ggml_backend_buffer_context_cuda {
8269
8831
 
8270
8832
  static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
8271
8833
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8272
- CUDA_CHECK(cudaFree(ctx->device));
8834
+ CUDA_CHECK(cudaFree(ctx->dev_ptr));
8273
8835
  delete ctx;
8274
8836
  }
8275
8837
 
8276
8838
  static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
8277
8839
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8278
- return ctx->device;
8279
- }
8280
-
8281
- static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8282
- int64_t row_low = 0;
8283
- int64_t row_high = ggml_nrows(tensor);
8284
- int64_t nrows_split = row_high - row_low;
8285
-
8286
- size_t size = ggml_nbytes_split(tensor, nrows_split);
8287
-
8288
- int64_t ne0 = tensor->ne[0];
8289
-
8290
- if (ggml_is_quantized(tensor->type)) {
8291
- if (ne0 % MATRIX_ROW_PADDING != 0) {
8292
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8293
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8294
- }
8295
- }
8296
-
8297
- return size;
8298
-
8299
- UNUSED(buffer);
8840
+ return ctx->dev_ptr;
8300
8841
  }
8301
8842
 
8302
8843
  static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8303
8844
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8304
8845
 
8305
8846
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
8306
- assert(tensor->view_src->buffer->backend == buffer->backend);
8847
+ assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
8307
8848
  tensor->backend = tensor->view_src->backend;
8308
8849
  tensor->extra = tensor->view_src->extra;
8309
8850
  return;
@@ -8311,7 +8852,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8311
8852
 
8312
8853
  ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
8313
8854
 
8314
- extra->data_device[g_main_device] = tensor->data;
8855
+ extra->data_device[ctx->device] = tensor->data;
8315
8856
 
8316
8857
  tensor->backend = GGML_BACKEND_GPU;
8317
8858
  tensor->extra = extra;
@@ -8323,64 +8864,208 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8323
8864
  int64_t nrows_split = row_high - row_low;
8324
8865
 
8325
8866
  size_t original_size = ggml_nbytes_split(tensor, nrows_split);
8326
- size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
8867
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
8327
8868
 
8328
8869
  if (padded_size > original_size && tensor->view_src == nullptr) {
8329
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
8870
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
8330
8871
  }
8331
8872
  }
8332
8873
 
8333
8874
  UNUSED(buffer);
8334
8875
  }
8335
8876
 
8877
+ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
8878
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
8879
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8880
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8881
+
8882
+ CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
8883
+
8884
+ UNUSED(buffer);
8885
+ }
8886
+
8887
+ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
8888
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
8889
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8890
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8891
+
8892
+ CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
8893
+
8894
+ UNUSED(buffer);
8895
+ }
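A hedged usage sketch of the new synchronous buffer callbacks; the generic helpers ggml_backend_tensor_set/get are assumed from ggml-backend.h and are expected to land in the callbacks above, i.e. plain blocking cudaMemcpy in both directions:

#include "ggml-backend.h"
#include <vector>

// upload host data into a tensor living in a CUDA buffer, then read it back
static void roundtrip(struct ggml_tensor * t, size_t n_floats) {
    std::vector<float> host(n_floats, 1.0f);
    ggml_backend_tensor_set(t, host.data(), 0, n_floats*sizeof(float)); // host -> device
    ggml_backend_tensor_get(t, host.data(), 0, n_floats*sizeof(float)); // device -> host
}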
8896
+
8336
8897
  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
8337
- /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
8338
- /* .get_base = */ ggml_backend_cuda_buffer_get_base,
8339
- /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
8340
- /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
8341
- /* .free_tensor = */ NULL,
8898
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
8899
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
8900
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
8901
+ /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
8902
+ /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
8903
+ /* .cpy_tensor_from = */ NULL,
8904
+ /* .cpy_tensor_to = */ NULL,
8342
8905
  };
8343
8906
 
8344
- static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
8345
- ggml_cuda_set_device(g_main_device);
8907
+ // cuda buffer type
8908
+
8909
+ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
8910
+ int device = (int) (intptr_t) buft->context;
8346
8911
 
8347
- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
8912
+ ggml_cuda_set_device(device);
8348
8913
 
8349
8914
  size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
8350
8915
 
8351
- ggml_cuda_set_device(g_main_device);
8352
- CUDA_CHECK(cudaMalloc(&ctx->device, size));
8916
+ void * dev_ptr;
8917
+ CUDA_CHECK(cudaMalloc(&dev_ptr, size));
8918
+
8919
+ ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
8353
8920
 
8354
- return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
8921
+ return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
8355
8922
  }
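The device index travels through buft->context as a plain integer cast to a pointer, avoiding a heap allocation per buffer type; a tiny sketch of that round trip (hypothetical helpers):

#include <cstdint>

static void * encode_device(int device)     { return (void *) (intptr_t) device; }
static int    decode_device(void * context) { return (int) (intptr_t) context; }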
8356
8923
 
8357
- static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
8924
+ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
8358
8925
  return 128;
8926
+
8927
+ UNUSED(buft);
8928
+ }
8929
+
8930
+ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
8931
+ int64_t row_low = 0;
8932
+ int64_t row_high = ggml_nrows(tensor);
8933
+ int64_t nrows_split = row_high - row_low;
8934
+
8935
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
8936
+
8937
+ int64_t ne0 = tensor->ne[0];
8938
+
8939
+ if (ggml_is_quantized(tensor->type)) {
8940
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
8941
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8942
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8943
+ }
8944
+ }
8945
+
8946
+ return size;
8947
+
8948
+ UNUSED(buft);
8949
+ }
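A worked example of the padding above, assuming Q4_0's 32-element, 18-byte blocks: a row length of 4000 sits 416 elements past a multiple of MATRIX_ROW_PADDING (512), so 96 extra elements, i.e. 96*18/32 = 54 extra bytes, are reserved so the quantized kernels can read past the last row without going out of bounds.

static_assert(512 - 4000 % 512 == 96, "96 padding elements for ne0 = 4000");
static_assert(96 * 18 / 32 == 54,     "54 extra bytes added to the allocation");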
8950
+
8951
+ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
8952
+ return ggml_backend_is_cuda(backend);
8953
+
8954
+ UNUSED(buft);
8955
+ }
8956
+
8957
+ static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
8958
+ /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
8959
+ /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
8960
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
8961
+ /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
8962
+ };
8963
+
8964
+ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
8965
+ static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
8966
+ static bool ggml_backend_buffer_type_cuda_initialized = false;
8967
+ if (!ggml_backend_buffer_type_cuda_initialized) {
8968
+ for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
8969
+ ggml_backend_buffer_type_cuda[i] = {
8970
+ /* .iface = */ cuda_backend_buffer_type_interface,
8971
+ /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
8972
+ };
8973
+ }
8974
+ ggml_backend_buffer_type_cuda_initialized = true;
8975
+ }
8976
+
8977
+ return &ggml_backend_buffer_type_cuda[device];
8978
+ }
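A hedged usage sketch of the per-device buffer type; ggml_backend_buft_alloc_buffer and ggml_backend_buffer_free are assumed from the ggml-backend API of this version:

#include "ggml-backend.h"
#include "ggml-cuda.h"

static void demo_alloc(int device) {
    ggml_backend_buffer_type_t buft = ggml_backend_cuda_buffer_type(device);
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 16*1024*1024); // 16 MiB on that device
    // ... place tensors into the buffer (e.g. via ggml-alloc) and use them ...
    ggml_backend_buffer_free(buffer);
}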
8979
+
8980
+ // host buffer type
8981
+
8982
+ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
8983
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8984
+ CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
8985
+ delete ctx;
8986
+ }
8987
+
8988
+ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
8989
+ void * ptr;
8990
+ CUDA_CHECK(cudaMallocHost(&ptr, size));
8991
+
8992
+ // FIXME: this is a hack to avoid having to implement a new buffer type
8993
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
8994
+ buffer->buft = buft;
8995
+ buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
8996
+
8997
+ return buffer;
8998
+
8999
+ UNUSED(buft);
9000
+ }
9001
+
9002
+ struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
9003
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
9004
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
9005
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9006
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
9007
+ };
9008
+
9009
+ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
9010
+ static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
9011
+ /* .iface = */ cuda_backend_host_buffer_type_interface,
9012
+ /* .context = */ nullptr,
9013
+ };
9014
+
9015
+ return &ggml_backend_buffer_type_cuda_host;
9016
+ }
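The host buffer type hands out pinned (cudaMallocHost) memory behind the stock CPU buffer interface, which speeds up host/device transfers; a hedged sketch of allocating such a staging buffer, with ggml_backend_buft_alloc_buffer assumed from ggml-backend.h:

#include "ggml-backend.h"
#include "ggml-cuda.h"

static ggml_backend_buffer_t alloc_pinned_staging(size_t nbytes) {
    // page-locked host memory, released through the custom free_buffer above
    return ggml_backend_buft_alloc_buffer(ggml_backend_cuda_host_buffer_type(), nbytes);
}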
9017
+
9018
+ // backend
9019
+
9020
+ struct ggml_backend_context_cuda {
9021
+ int device;
9022
+ };
9023
+
9024
+ static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
9025
+ return GGML_CUDA_NAME;
9026
+
8359
9027
  UNUSED(backend);
8360
9028
  }
8361
9029
 
9030
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
9031
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9032
+
9033
+ delete cuda_ctx;
9034
+ delete backend;
9035
+ }
9036
+
9037
+ static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
9038
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9039
+
9040
+ return ggml_backend_cuda_buffer_type(cuda_ctx->device);
9041
+ }
9042
+
8362
9043
  static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
9044
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9045
+
9046
+ GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
8363
9047
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
8364
9048
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8365
9049
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8366
9050
 
8367
- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
8368
-
8369
- UNUSED(backend);
9051
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
8370
9052
  }
8371
9053
 
8372
9054
  static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
9055
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9056
+
9057
+ GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
8373
9058
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
8374
9059
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8375
9060
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8376
9061
 
8377
- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8378
-
8379
- UNUSED(backend);
9062
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
8380
9063
  }
8381
9064
 
8382
9065
  static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
8383
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
9066
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9067
+
9068
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
8384
9069
 
8385
9070
  UNUSED(backend);
8386
9071
  }
@@ -8394,14 +9079,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
8394
9079
  UNUSED(cgraph);
8395
9080
  }
8396
9081
 
8397
- [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
9082
+ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8398
9083
  GGML_ASSERT(!"not implemented");
8399
9084
 
8400
9085
  UNUSED(backend);
8401
9086
  UNUSED(plan);
8402
9087
  }
8403
9088
 
8404
- [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
9089
+ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8405
9090
  GGML_ASSERT(!"not implemented");
8406
9091
 
8407
9092
  UNUSED(backend);
@@ -8409,7 +9094,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
8409
9094
  }
8410
9095
 
8411
9096
  static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
8412
- ggml_cuda_set_device(g_main_device);
9097
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9098
+
9099
+ ggml_cuda_set_main_device(cuda_ctx->device);
8413
9100
 
8414
9101
  ggml_compute_params params = {};
8415
9102
  params.type = GGML_TASK_COMPUTE;
@@ -8417,13 +9104,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
8417
9104
  for (int i = 0; i < cgraph->n_nodes; i++) {
8418
9105
  ggml_tensor * node = cgraph->nodes[i];
8419
9106
 
8420
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
9107
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
8421
9108
  continue;
8422
- }
9109
+
8423
9110
  assert(node->backend == GGML_BACKEND_GPU);
9111
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
9112
+ assert(node->extra != nullptr);
9113
+
8424
9114
  for (int j = 0; j < GGML_MAX_SRC; j++) {
8425
9115
  if (node->src[j] != nullptr) {
8426
9116
  assert(node->src[j]->backend == GGML_BACKEND_GPU);
9117
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
9118
+ assert(node->src[j]->extra != nullptr);
8427
9119
  }
8428
9120
  }
8429
9121
 
@@ -8460,27 +9152,98 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
8460
9152
  UNUSED(backend);
8461
9153
  }
8462
9154
 
9155
+ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
9156
+ switch (op->op) {
9157
+ case GGML_OP_UNARY:
9158
+ switch (ggml_get_unary_op(op)) {
9159
+ case GGML_UNARY_OP_GELU:
9160
+ case GGML_UNARY_OP_SILU:
9161
+ case GGML_UNARY_OP_RELU:
9162
+ return true;
9163
+ default:
9164
+ return false;
9165
+ }
9166
+ break;
9167
+ case GGML_OP_MUL_MAT:
9168
+ case GGML_OP_MUL_MAT_ID:
9169
+ {
9170
+ struct ggml_tensor * a;
9171
+ struct ggml_tensor * b;
9172
+ if (op->op == GGML_OP_MUL_MAT) {
9173
+ a = op->src[0];
9174
+ b = op->src[1];
9175
+ } else {
9176
+ a = op->src[2];
9177
+ b = op->src[1];
9178
+ }
9179
+ if (a->ne[3] != b->ne[3]) {
9180
+ return false;
9181
+ }
9182
+ return true;
9183
+ } break;
9184
+ case GGML_OP_NONE:
9185
+ case GGML_OP_RESHAPE:
9186
+ case GGML_OP_VIEW:
9187
+ case GGML_OP_PERMUTE:
9188
+ case GGML_OP_TRANSPOSE:
9189
+ case GGML_OP_NORM:
9190
+ case GGML_OP_REPEAT:
9191
+ case GGML_OP_GET_ROWS:
9192
+ case GGML_OP_DUP:
9193
+ case GGML_OP_ADD:
9194
+ case GGML_OP_MUL:
9195
+ case GGML_OP_DIV:
9196
+ case GGML_OP_RMS_NORM:
9197
+ case GGML_OP_SCALE:
9198
+ case GGML_OP_SQR:
9199
+ case GGML_OP_CLAMP:
9200
+ case GGML_OP_CPY:
9201
+ case GGML_OP_CONT:
9202
+ case GGML_OP_DIAG_MASK_INF:
9203
+ case GGML_OP_SOFT_MAX:
9204
+ case GGML_OP_ROPE:
9205
+ case GGML_OP_ALIBI:
9206
+ case GGML_OP_IM2COL:
9207
+ case GGML_OP_SUM_ROWS:
9208
+ case GGML_OP_ARGSORT:
9209
+ return true;
9210
+ default:
9211
+ return false;
9212
+ }
9213
+
9214
+ UNUSED(backend);
9215
+ }
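A hedged sketch of how a caller might consult the new hook, through the assumed public wrapper ggml_backend_supports_op, to decide where a node runs:

#include "ggml-backend.h"

// prefer the CUDA backend when it supports the node, otherwise fall back to CPU
static ggml_backend_t pick_backend(ggml_backend_t cuda, ggml_backend_t cpu, const struct ggml_tensor * node) {
    return ggml_backend_supports_op(cuda, node) ? cuda : cpu;
}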
9216
+
8463
9217
  static ggml_backend_i cuda_backend_i = {
8464
- /* .get_name = */ ggml_backend_cuda_name,
8465
- /* .free = */ ggml_backend_cuda_free,
8466
- /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
8467
- /* .get_alignment = */ ggml_backend_cuda_get_alignment,
8468
- /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
8469
- /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
8470
- /* .synchronize = */ ggml_backend_cuda_synchronize,
8471
- /* .cpy_tensor_from = */ nullptr,
8472
- /* .cpy_tensor_to = */ nullptr,
8473
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
8474
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
8475
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
8476
- /* .graph_compute = */ ggml_backend_cuda_graph_compute,
8477
- /* .supports_op = */ nullptr,
9218
+ /* .get_name = */ ggml_backend_cuda_name,
9219
+ /* .free = */ ggml_backend_cuda_free,
9220
+ /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
9221
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
9222
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
9223
+ /* .cpy_tensor_from_async = */ NULL,
9224
+ /* .cpy_tensor_to_async = */ NULL,
9225
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
9226
+ /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
9227
+ /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
9228
+ /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
9229
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
9230
+ /* .supports_op = */ ggml_backend_cuda_supports_op,
8478
9231
  };
8479
9232
 
8480
- ggml_backend_t ggml_backend_cuda_init() {
9233
+ ggml_backend_t ggml_backend_cuda_init(int device) {
8481
9234
  ggml_init_cublas(); // TODO: remove from ggml.c
8482
9235
 
8483
- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
9236
+ if (device < 0 || device >= ggml_cuda_get_device_count()) {
9237
+ fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
9238
+ return nullptr;
9239
+ }
9240
+
9241
+ // not strictly necessary, but it may reduce the overhead of the first graph_compute
9242
+ ggml_cuda_set_main_device(device);
9243
+
9244
+ ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
9245
+ /* .device = */ device
9246
+ };
8484
9247
 
8485
9248
  ggml_backend_t cuda_backend = new ggml_backend {
8486
9249
  /* .interface = */ cuda_backend_i,
@@ -8489,3 +9252,25 @@ ggml_backend_t ggml_backend_cuda_init() {
8489
9252
 
8490
9253
  return cuda_backend;
8491
9254
  }
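A hedged usage sketch of the new signature: the device index is passed explicitly and failure is reported with nullptr instead of aborting; ggml_backend_free is assumed from ggml-backend.h:

#include "ggml-backend.h"
#include "ggml-cuda.h"
#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_cuda_init(0); // bind the backend to device 0
    if (backend == nullptr) {
        fprintf(stderr, "ggml_backend_cuda_init failed\n");
        return 1;
    }
    // ... allocate buffers and compute graphs with this backend ...
    ggml_backend_free(backend);
    return 0;
}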
9255
+
9256
+ bool ggml_backend_is_cuda(ggml_backend_t backend) {
9257
+ return backend->iface.get_name == ggml_backend_cuda_name;
9258
+ }
9259
+
9260
+ static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
9261
+ ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
9262
+ return cuda_backend;
9263
+
9264
+ UNUSED(params);
9265
+ }
9266
+
9267
+ extern "C" int ggml_backend_cuda_reg_devices() {
9268
+ int device_count = ggml_cuda_get_device_count();
9269
+ //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
9270
+ for (int i = 0; i < device_count; i++) {
9271
+ char name[128];
9272
+ snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
9273
+ ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
9274
+ }
9275
+ return device_count;
9276
+ }