llama_cpp 0.9.4 → 0.10.0

@@ -1,7 +1,8 @@
  #include <algorithm>
- #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
+ #include <cinttypes>
+ #include <float.h>
  #include <limits>
  #include <stdint.h>
  #include <stdio.h>
@@ -69,6 +70,7 @@
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
  #define cudaSetDevice hipSetDevice
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamFireAndForget hipStreamFireAndForget
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
@@ -190,7 +192,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
  cudaGetErrorString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"CUDA error"); \
  } \
  } while (0)

@@ -204,7 +206,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #else
@@ -216,7 +218,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #endif // CUDART_VERSION >= 11
@@ -433,8 +435,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define WARP_SIZE 32
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

- #define CUDA_ADD_BLOCK_SIZE 256
- #define CUDA_MUL_BLOCK_SIZE 256
  #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
  #define CUDA_RELU_BLOCK_SIZE 256
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_CLAMP_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_SOFT_MAX_BLOCK_SIZE 1024
  #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -501,40 +502,112 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

- static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
- if (i >= kx) {
- return;
+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
  }
- dst[i] = x[i] + y[i%ky];
+ return x;
  }

- static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+ }

- if (i >= k) {
- return;
+ static __device__ __forceinline__ float warp_reduce_max(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
  }
- dst[i] = __hadd(x[i], __float2half(y[i]));
+ return x;
  }

- static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+ return b;
+ }

- if (i >= k) {
+ static __device__ __forceinline__ float op_add(const float a, const float b) {
+ return a + b;
+ }
+
+ static __device__ __forceinline__ float op_mul(const float a, const float b) {
+ return a * b;
+ }
+
+ static __device__ __forceinline__ float op_div(const float a, const float b) {
+ return a / b;
+ }
+
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+ int ne0, int ne1, int ne2, int ne3,
+ int ne10, int ne11, int ne12, int ne13,
+ /*int s0, */ int s1, int s2, int s3,
+ /*int s10,*/ int s11, int s12, int s13) {
+ const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+ const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+ const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+ const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
+
+ if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
  return;
  }
- dst[i] = __half2float(x[i]) + y[i];
+
+ const int i11 = i1 % ne11;
+ const int i12 = i2 % ne12;
+ const int i13 = i3 % ne13;
+
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+ const size_t i_dst = i_src0;
+
+ const src0_t * src0_row = src0 + i_src0;
+ const src1_t * src1_row = src1 + i_src1;
+ dst_t * dst_row = dst + i_dst;
+
+ for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+ const int i10 = i0 % ne10;
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+ }
  }

- static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+ int ne0, int ne1, int ne2, int ne3,
+ int ne10, int ne11, int ne12, int ne13,
+ /*int s0, */ int s1, int s2, int s3,
+ /*int s10,*/ int s11, int s12, int s13) {
+
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

- if (i >= kx) {
+ const int i3 = i/(ne2*ne1*ne0);
+ const int i2 = (i/(ne1*ne0)) % ne2;
+ const int i1 = (i/ne0) % ne1;
+ const int i0 = i % ne0;
+
+ if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
  return;
  }
- dst[i] = x[i] * y[i%ky];
+
+ const int i11 = i1 % ne11;
+ const int i12 = i2 % ne12;
+ const int i13 = i3 % ne13;
+
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+ const size_t i_dst = i_src0;
+
+ const src0_t * src0_row = src0 + i_src0;
+ const src1_t * src1_row = src1 + i_src1;
+ dst_t * dst_row = dst + i_dst;
+
+ const int i10 = i0 % ne10;
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
  }

  static __global__ void gelu_f32(const float * x, float * dst, const int k) {
@@ -577,22 +650,11 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] * x[i];
  }

- static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
- a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
- }
- return a;
- }
-
  template <int block_size>
- static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+ static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

- const float eps = 1e-5f;
-
  float2 mean_var = make_float2(0.f, 0.f);

  for (int col = tid; col < ncols; col += block_size) {
@@ -624,14 +686,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  }
  }

- static __device__ __forceinline__ float warp_reduce_sum(float x) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- x += __shfl_xor_sync(0xffffffff, x, mask, 32);
- }
- return x;
- }
-
  template <int block_size>
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -4550,6 +4604,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  cpy_1(cx + x_offset, cdst + dst_offset);
  }

+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+ const float * xi = (const float *) cxi;
+ block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+ float amax = 0.0f; // absolute max
+
+ for (int j = 0; j < QK8_0; j++) {
+ const float v = xi[j];
+ amax = fmaxf(amax, fabsf(v));
+ }
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ dsti->d = d;
+
+ for (int j = 0; j < QK8_0; ++j) {
+ const float x0 = xi[j]*id;
+
+ dsti->qs[j] = roundf(x0);
+ }
+ }
+
+ static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+ const float * xi = (const float *) cxi;
+ block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+ float amax = 0.0f;
+ float vmax = 0.0f;
+
+ for (int j = 0; j < QK4_0; ++j) {
+ const float v = xi[j];
+ if (amax < fabsf(v)) {
+ amax = fabsf(v);
+ vmax = v;
+ }
+ }
+
+ const float d = vmax / -8;
+ const float id = d ? 1.0f/d : 0.0f;
+
+ dsti->d = d;
+
+ for (int j = 0; j < QK4_0/2; ++j) {
+ const float x0 = xi[0 + j]*id;
+ const float x1 = xi[QK4_0/2 + j]*id;
+
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+ dsti->qs[j] = xi0;
+ dsti->qs[j] |= xi1 << 4;
+ }
+ }
+
+ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+ const float * xi = (const float *) cxi;
+ block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+ float vmin = FLT_MAX;
+ float vmax = -FLT_MAX;
+
+ for (int j = 0; j < QK4_1; ++j) {
+ const float v = xi[j];
+
+ if (v < vmin) vmin = v;
+ if (v > vmax) vmax = v;
+ }
+
+ const float d = (vmax - vmin) / ((1 << 4) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ dsti->dm.x = d;
+ dsti->dm.y = vmin;
+
+ for (int j = 0; j < QK4_1/2; ++j) {
+ const float x0 = (xi[0 + j] - vmin)*id;
+ const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+ dsti->qs[j] = xi0;
+ dsti->qs[j] |= xi1 << 4;
+ }
+ }
+
+ template <cpy_kernel_t cpy_blck, int qk>
+ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+ const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+ if (i >= ne) {
+ return;
+ }
+
+ const int i02 = i / (ne00*ne01);
+ const int i01 = (i - i02*ne01*ne00) / ne00;
+ const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+ const int i12 = i / (ne10*ne11);
+ const int i11 = (i - i12*ne10*ne11) / ne10;
+ const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+ cpy_blck(cx + x_offset, cdst + dst_offset);
+ }
+
  static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
  const float y = (i0 / 2 - low) / max(0.001f, high - low);
  return 1.0f - min(1.0f, max(0.0f, y));
@@ -4610,8 +4774,8 @@ static __global__ void rope(

  template<typename T, bool has_pos>
  static __global__ void rope_neox(
- const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
- float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
  ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

@@ -4620,23 +4784,25 @@
  }

  const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int i = row*ncols + col/2;
+ const int ib = col / n_dims;
+ const int ic = col % n_dims;
+
+ const int i = row*ncols + ib*n_dims + ic/2;
  const int i2 = row/p_delta_rows;

- // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
- const float cur_rot = -float(col)/ncols;
+ float cur_rot = inv_ndims * ic - ib;

  const int p = has_pos ? pos[i2] : 0;
- const float theta_base = p*powf(freq_base, cur_rot);
+ const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

  float cos_theta, sin_theta;
  rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

  const float x0 = x[i + 0];
- const float x1 = x[i + ncols/2];
+ const float x1 = x[i + n_dims/2];

- dst[i + 0] = x0*cos_theta - x1*sin_theta;
- dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
  }

  static __global__ void rope_glm_f32(
@@ -4702,6 +4868,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
  dst[i] = col * m_k + x[i];
  }

+ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+ const int row = blockIdx.y;
+ const int col = threadIdx.x;
+
+ float sum = 0.0f;
+ for (int i = col; i < ncols; i += blockDim.x) {
+ sum += x[row * ncols + i];
+ }
+
+ sum = warp_reduce_sum(sum);
+
+ if (col == 0) {
+ dst[row] = sum;
+ }
+ }
+
+ template<typename T>
+ static inline __device__ void swap(T & a, T & b) {
+ T tmp = a;
+ a = b;
+ b = tmp;
+ }
+
+ template<ggml_sort_order order>
+ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
+ // bitonic sort
+ int col = threadIdx.x;
+ int row = blockIdx.y;
+
+ if (col >= ncols) return;
+
+ const float * x_row = x + row * ncols;
+ int * dst_row = dst + row * ncols;
+
+ // initialize indices
+ if (col < ncols) {
+ dst_row[col] = col;
+ }
+ __syncthreads();
+
+ for (int k = 2; k <= ncols; k *= 2) {
+ for (int j = k / 2; j > 0; j /= 2) {
+ int ixj = col ^ j;
+ if (ixj > col) {
+ if ((col & k) == 0) {
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+ swap(dst_row[col], dst_row[ixj]);
+ }
+ } else {
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+ swap(dst_row[col], dst_row[ixj]);
+ }
+ }
+ }
+ __syncthreads();
+ }
+ }
+ }
+
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
  const int col = blockDim.y*blockIdx.y + threadIdx.y;
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
@@ -4711,49 +4936,79 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  }

  const int i = row*ncols + col;
- // dst[i] = col > n_past + row ? -INFINITY : x[i];
- dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+ //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
  }

- // the CUDA soft max implementation differs from the CPU implementation
- // instead of doubles floats are used
- static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int block_size = blockDim.y;
- const int tid = threadIdx.y;
+ static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+ const int tid = threadIdx.x;
+ const int rowx = blockIdx.x;
+ const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+ const int block_size = blockDim.x;
+
+ const int warp_id = threadIdx.x / WARP_SIZE;
+ const int lane_id = threadIdx.x % WARP_SIZE;
+
+ __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];

  float max_val = -INFINITY;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- max_val = max(max_val, x[i]);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
  }

  // find the max value in the block
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ max_val = warp_reduce_max(max_val);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = -INFINITY;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = max_val;
+ }
+ __syncthreads();
+
+ max_val = buf[lane_id];
+ max_val = warp_reduce_max(max_val);
  }

  float tmp = 0.f;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- const float val = expf(x[i] - max_val);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
  tmp += val;
- dst[i] = val;
+ dst[ix] = val;
  }

- // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ // find the sum of exps in the block
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = 0.f;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = tmp;
+ }
+ __syncthreads();
+
+ tmp = buf[lane_id];
+ tmp = warp_reduce_sum(tmp);
  }

  const float inv_tmp = 1.f / tmp;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
+ const int i = rowx*ncols + col;
  dst[i] *= inv_tmp;
  }
  }
@@ -4805,25 +5060,119 @@ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const
  k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
  }

- static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
- const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
- }
-
- static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
- }
-
- static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
- }
+ template<float (*bin_op)(const float, const float)>
+ struct bin_bcast_cuda {
+ template<typename src0_t, typename src1_t, typename dst_t>
+ void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+ const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+ cudaStream_t stream) {
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+
+ int nr0 = ne10/ne0;
+ int nr1 = ne11/ne1;
+ int nr2 = ne12/ne2;
+ int nr3 = ne13/ne3;
+
+ int nr[4] = { nr0, nr1, nr2, nr3 };
+
+ // collapse dimensions until first broadcast dimension
+ int64_t cne0[] = {ne0, ne1, ne2, ne3};
+ int64_t cne1[] = {ne10, ne11, ne12, ne13};
+ size_t cnb0[] = {nb0, nb1, nb2, nb3};
+ size_t cnb1[] = {nb10, nb11, nb12, nb13};
+ auto collapse = [](int64_t cne[]) {
+ cne[0] *= cne[1];
+ cne[1] = cne[2];
+ cne[2] = cne[3];
+ cne[3] = 1;
+ };
+
+ auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+ cnb[1] *= cne[1];
+ cnb[2] *= cne[2];
+ cnb[3] *= cne[3];
+ };
+
+ for (int i = 0; i < 4; i++) {
+ if (nr[i] != 1) {
+ break;
+ }
+ if (i > 0) {
+ collapse_nb(cnb0, cne0);
+ collapse_nb(cnb1, cne1);
+ collapse(cne0);
+ collapse(cne1);
+ }
+ }
+ {
+ int64_t ne0 = cne0[0];
+ int64_t ne1 = cne0[1];
+ int64_t ne2 = cne0[2];
+ int64_t ne3 = cne0[3];
+
+ int64_t ne10 = cne1[0];
+ int64_t ne11 = cne1[1];
+ int64_t ne12 = cne1[2];
+ int64_t ne13 = cne1[3];
+
+ //size_t nb0 = cnb0[0];
+ size_t nb1 = cnb0[1];
+ size_t nb2 = cnb0[2];
+ size_t nb3 = cnb0[3];
+
+ //size_t nb10 = cnb1[0];
+ size_t nb11 = cnb1[1];
+ size_t nb12 = cnb1[2];
+ size_t nb13 = cnb1[3];
+
+ //size_t s0 = nb0 / sizeof(src1_t);
+ size_t s1 = nb1 / sizeof(src1_t);
+ size_t s2 = nb2 / sizeof(src1_t);
+ size_t s3 = nb3 / sizeof(src1_t);
+
+ //size_t s10 = nb10 / sizeof(src1_t);
+ size_t s11 = nb11 / sizeof(src1_t);
+ size_t s12 = nb12 / sizeof(src1_t);
+ size_t s13 = nb13 / sizeof(src1_t);
+
+
+ const int block_size = 128;
+
+ int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+ dim3 block_dims;
+ block_dims.x = std::min<unsigned int>(hne0, block_size);
+ block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+ block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+ dim3 block_nums(
+ (hne0 + block_dims.x - 1) / block_dims.x,
+ (ne1 + block_dims.y - 1) / block_dims.y,
+ (ne2*ne3 + block_dims.z - 1) / block_dims.z
+ );

- static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
- const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
- mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
- }
+ if (block_nums.z > 65535) {
+ // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+ int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+ k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+ src0_dd, src1_dd, dst_dd,
+ ne0, ne1, ne2, ne3,
+ ne10, ne11, ne12, ne13,
+ /* s0, */ s1, s2, s3,
+ /* s10, */ s11, s12, s13);
+ } else {
+ k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+ src0_dd, src1_dd, dst_dd,
+ ne0, ne1, ne2, ne3,
+ ne10, ne11, ne12, ne13,
+ /* s0, */ s1, s2, s3,
+ /* s10, */ s11, s12, s13);
+ }
+ }
+ }
+ };

  static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
@@ -4845,14 +5194,14 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
  sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
  }

- static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
  if (ncols < 1024) {
  const dim3 block_dims(WARP_SIZE, 1, 1);
- norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
  } else {
  const dim3 block_dims(1024, 1, 1);
- norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
  }
  }

@@ -4874,34 +5223,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
  }

- template<typename dst_t>
- static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+ dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

  template<typename dst_t>
@@ -4950,6 +5275,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
  #endif
  }

+ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0:
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ case GGML_TYPE_Q4_1:
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ case GGML_TYPE_Q5_0:
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+ case GGML_TYPE_Q5_1:
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+ case GGML_TYPE_Q8_0:
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+ case GGML_TYPE_Q2_K:
+ return dequantize_row_q2_K_cuda;
+ case GGML_TYPE_Q3_K:
+ return dequantize_row_q3_K_cuda;
+ case GGML_TYPE_Q4_K:
+ return dequantize_row_q4_K_cuda;
+ case GGML_TYPE_Q5_K:
+ return dequantize_row_q5_K_cuda;
+ case GGML_TYPE_Q6_K:
+ return dequantize_row_q6_K_cuda;
+ case GGML_TYPE_F32:
+ return dequantize_block_cuda<1, 1, convert_f32>;
+ default:
+ return nullptr;
+ }
+ }
+
+ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0:
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ case GGML_TYPE_Q4_1:
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ case GGML_TYPE_Q5_0:
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+ case GGML_TYPE_Q5_1:
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+ case GGML_TYPE_Q8_0:
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+ case GGML_TYPE_Q2_K:
+ return dequantize_row_q2_K_cuda;
+ case GGML_TYPE_Q3_K:
+ return dequantize_row_q3_K_cuda;
+ case GGML_TYPE_Q4_K:
+ return dequantize_row_q4_K_cuda;
+ case GGML_TYPE_Q5_K:
+ return dequantize_row_q5_K_cuda;
+ case GGML_TYPE_Q6_K:
+ return dequantize_row_q6_K_cuda;
+ case GGML_TYPE_F16:
+ return dequantize_block_cuda<1, 1, convert_f16>;
+ default:
+ return nullptr;
+ }
+ }
+
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5038,13 +5421,22 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

- static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % QK4_0 == 0);
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ dequantize_mul_mat_vec<1, 1, convert_f16>
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK4_0 == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(block_num_y, 1, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -5128,83 +5520,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

- static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
- dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(block_num_y, 1, 1);
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- dequantize_mul_mat_vec<1, 1, convert_f16>
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
- }
-
- static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_row_q4_0_cuda;
- case GGML_TYPE_Q4_1:
- return dequantize_row_q4_1_cuda;
- case GGML_TYPE_Q5_0:
- return dequantize_row_q5_0_cuda;
- case GGML_TYPE_Q5_1:
- return dequantize_row_q5_1_cuda;
- case GGML_TYPE_Q8_0:
- return dequantize_row_q8_0_cuda;
- case GGML_TYPE_Q2_K:
- return dequantize_row_q2_K_cuda;
- case GGML_TYPE_Q3_K:
- return dequantize_row_q3_K_cuda;
- case GGML_TYPE_Q4_K:
- return dequantize_row_q4_K_cuda;
- case GGML_TYPE_Q5_K:
- return dequantize_row_q5_K_cuda;
- case GGML_TYPE_Q6_K:
- return dequantize_row_q6_K_cuda;
- case GGML_TYPE_F32:
- return convert_fp32_to_fp16_cuda;
- default:
- return nullptr;
- }
- }
-
- static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_row_q4_0_cuda;
- case GGML_TYPE_Q4_1:
- return dequantize_row_q4_1_cuda;
- case GGML_TYPE_Q5_0:
- return dequantize_row_q5_0_cuda;
- case GGML_TYPE_Q5_1:
- return dequantize_row_q5_1_cuda;
- case GGML_TYPE_Q8_0:
- return dequantize_row_q8_0_cuda;
- case GGML_TYPE_Q2_K:
- return dequantize_row_q2_K_cuda;
- case GGML_TYPE_Q3_K:
- return dequantize_row_q3_K_cuda;
- case GGML_TYPE_Q4_K:
- return dequantize_row_q4_K_cuda;
- case GGML_TYPE_Q5_K:
- return dequantize_row_q5_K_cuda;
- case GGML_TYPE_Q6_K:
- return dequantize_row_q6_K_cuda;
- case GGML_TYPE_F16:
- return convert_fp16_to_fp32_cuda;
- default:
- return nullptr;
- }
- }
-
  static void ggml_mul_mat_q4_0_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -5697,6 +6012,39 @@ static void ggml_cpy_f32_f16_cuda(
  (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
  }

+ static void ggml_cpy_f32_q8_0_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ GGML_ASSERT(ne % QK8_0 == 0);
+ const int num_blocks = ne / QK8_0;
+ cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
+ static void ggml_cpy_f32_q4_0_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ GGML_ASSERT(ne % QK4_0 == 0);
+ const int num_blocks = ne / QK4_0;
+ cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
+ static void ggml_cpy_f32_q4_1_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ GGML_ASSERT(ne % QK4_1 == 0);
+ const int num_blocks = ne / QK4_1;
+ cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
  static void ggml_cpy_f16_f16_cuda(
  const char * cx, char * cdst, const int ne,
  const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -5739,20 +6087,26 @@ static void rope_cuda(

  template<typename T>
  static void rope_neox_cuda(
- const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
  float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
  ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ const float inv_ndims = -1.0f / n_dims;
+
  if (pos == nullptr) {
  rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  } else {
  rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  }
  }
@@ -5777,6 +6131,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
  alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
  }

+ static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ const dim3 block_nums(1, nrows, 1);
+ k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+ }
+
+ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+ // bitonic sort requires ncols to be power of 2
+ GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+ const dim3 block_dims(ncols, 1, 1);
+ const dim3 block_nums(1, nrows, 1);
+ if (order == GGML_SORT_ASC) {
+ k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+ } else if (order == GGML_SORT_DESC) {
+ k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+ } else {
+ GGML_ASSERT(false);
+ }
+ }
+
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
  const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -5784,10 +6159,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }

- static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
- const dim3 block_dims(1, WARP_SIZE, 1);
+ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+ int nth = WARP_SIZE;
+ while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+ const dim3 block_dims(nth, 1, 1);
  const dim3 block_nums(nrows_x, 1, 1);
- soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+ soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
  }

  static void im2col_f32_f16_cuda(const float * x, half * dst,
@@ -5867,7 +6244,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
  return ptr;
  }
  #ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
  (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
  #endif
  void * ptr;
@@ -6005,7 +6382,7 @@ void * ggml_cuda_host_malloc(size_t size) {
  // The allocation error can be bypassed. A null ptr will assigned out of this function.
  // This can fixed the OOM error in WSL.
  cudaGetLastError();
- fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
+ fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
  size/1024.0/1024.0, cudaGetErrorString(err));
  return nullptr;
  }
@@ -6050,75 +6427,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
  if (nb0 == ts && nb1 == ts*ne0/bs) {
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
- }
- if (nb0 == ts) {
+ } else if (nb0 == ts) {
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
- }
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
- const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
- // pretend the row is a matrix with cols=1
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
- if (r != cudaSuccess) { return r; }
- }
- return cudaSuccess;
- }
-
- static void ggml_cuda_op_repeat(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
- // guaranteed to be an integer due to the check in ggml_can_repeat
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
- const int64_t ne3 = dst->ne[3];
-
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- const size_t nb0 = dst->nb[0];
- const size_t nb1 = dst->nb[1];
- const size_t nb2 = dst->nb[2];
- const size_t nb3 = dst->nb[3];
-
- const size_t nb00 = src0->nb[0];
- const size_t nb01 = src0->nb[1];
- const size_t nb02 = src0->nb[2];
- const size_t nb03 = src0->nb[3];
-
- const int nr0 = (int)(ne0/ne00);
- const int nr1 = (int)(ne1/ne01);
- const int nr2 = (int)(ne2/ne02);
- const int nr3 = (int)(ne3/ne03);
-
- // TODO: support for transposed / permuted tensors
- GGML_ASSERT(nb0 == sizeof(float));
- GGML_ASSERT(nb00 == sizeof(float));
-
- // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
- for (int i3 = 0; i3 < nr3; i3++) {
- for (int k3 = 0; k3 < ne03; k3++) {
- for (int i2 = 0; i2 < nr2; i2++) {
- for (int k2 = 0; k2 < ne02; k2++) {
- for (int i1 = 0; i1 < nr1; i1++) {
- for (int k1 = 0; k1 < ne01; k1++) {
- for (int i0 = 0; i0 < nr0; i0++) {
- CUDA_CHECK(cudaMemcpyAsync(
- (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
- (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
- ne00*nb0, cudaMemcpyDeviceToDevice, stream));
- }
- }
- }
- }
- }
+ } else {
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+ // pretend the row is a matrix with cols=1
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+ if (r != cudaSuccess) return r;
  }
+ return cudaSuccess;
  }
-
- (void) src1;
- (void) src1_d;
  }

  static void ggml_cuda_op_get_rows(
@@ -6165,44 +6485,55 @@ static void ggml_cuda_op_get_rows(
  }
  }

- inline void ggml_cuda_op_add(
+ template<class op>
+ inline void ggml_cuda_op_bin_bcast(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

  GGML_ASSERT(src1->type == GGML_TYPE_F32);

- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
-
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
+ op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
- add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
- add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
  } else {
- fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
+ fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+ ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
  GGML_ASSERT(false);
  }
+ }
+
+ static void ggml_cuda_op_repeat(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
+
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);

  (void) src1;
- (void) dst;
+ (void) src1_d;
  }

- inline void ggml_cuda_op_mul(
+ inline void ggml_cuda_op_add(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+ }

- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
+ inline void ggml_cuda_op_mul(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+ }

- (void) dst;
+ inline void ggml_cuda_op_div(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
  }

  inline void ggml_cuda_op_gelu(
@@ -6271,7 +6602,10 @@ inline void ggml_cuda_op_norm(
  const int64_t ne00 = src0->ne[0];
  const int64_t nrows = ggml_nrows(src0);

- norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

  (void) src1;
  (void) dst;
@@ -6426,6 +6760,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
  const int64_t src1_padded_row_size, const cudaStream_t & stream) {

+ GGML_ASSERT(ggml_nrows(src1) == 1);
+
  const int64_t ne00 = src0->ne[0];
  const int64_t row_diff = row_high - row_low;

@@ -6485,7 +6821,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
  size_t ash;
  dfloat * src1_dfloat = nullptr; // dfloat == half

- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+ bool src1_convert_f16 =
+ src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
  src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
  src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

@@ -6707,15 +7044,14 @@ inline void ggml_cuda_op_rope(
  GGML_ASSERT(false);
  rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
  } else if (is_neox) {
- GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
  if (src0->type == GGML_TYPE_F32) {
  rope_neox_cuda(
- (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else if (src0->type == GGML_TYPE_F16) {
  rope_neox_cuda(
- (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else {
@@ -6812,6 +7148,42 @@ inline void ggml_cuda_op_im2col(
6812
7148
  (void) src0_dd;
6813
7149
  }
6814
7150
 
7151
+ inline void ggml_cuda_op_sum_rows(
7152
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7153
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7154
+
7155
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7156
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7157
+
7158
+ const int64_t ncols = src0->ne[0];
7159
+ const int64_t nrows = ggml_nrows(src0);
7160
+
7161
+ sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
7162
+
7163
+ (void) src1;
7164
+ (void) dst;
7165
+ (void) src1_dd;
7166
+ }
7167
+
7168
+ inline void ggml_cuda_op_argsort(
7169
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7170
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7171
+
7172
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7173
+ GGML_ASSERT( dst->type == GGML_TYPE_I32);
7174
+
7175
+ const int64_t ncols = src0->ne[0];
7176
+ const int64_t nrows = ggml_nrows(src0);
7177
+
7178
+ enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
7179
+
7180
+ argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
7181
+
7182
+ (void) src1;
7183
+ (void) dst;
7184
+ (void) src1_dd;
7185
+ }
7186
+
6815
7187
  inline void ggml_cuda_op_diag_mask_inf(
6816
7188
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6817
7189
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6839,14 +7211,18 @@ inline void ggml_cuda_op_soft_max(
6839
7211
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6840
7212
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6841
7213
 
7214
+ GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
7215
+
6842
7216
  const int64_t ne00 = src0->ne[0];
6843
- const int64_t nrows = ggml_nrows(src0);
7217
+ const int64_t nrows_x = ggml_nrows(src0);
7218
+ const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
6844
7219
 
6845
- soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
7220
+ float scale = 1.0f;
7221
+ memcpy(&scale, dst->op_params, sizeof(float));
7222
+
7223
+ soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
6846
7224
 
6847
- (void) src1;
6848
7225
  (void) dst;
6849
- (void) src1_dd;
6850
7226
  }
6851
7227
 
6852
7228
  inline void ggml_cuda_op_scale(
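With the soft_max change above, the op now reads a scale factor from dst->op_params and takes an optional F32 mask in src1, whose nrows_y rows are broadcast over the nrows_x input rows. A minimal CPU reference of softmax(scale * x + mask) per row, written under the assumption that the mask row reused for input row r is r % nrows_y (the broadcast the new kernel call implies):

#include <cmath>
#include <cstdint>

// Softmax of (scale * x + mask) applied independently to each row of x.
static void soft_max_ref(const float * x, const float * mask, float * dst,
                         int64_t ncols, int64_t nrows_x, int64_t nrows_y, float scale) {
    for (int64_t r = 0; r < nrows_x; ++r) {
        const float * xr = x + r*ncols;
        const float * mr = mask ? mask + (r % nrows_y)*ncols : nullptr;
        float * dr = dst + r*ncols;

        float max_val = -INFINITY;
        for (int64_t c = 0; c < ncols; ++c) {
            dr[c] = scale*xr[c] + (mr ? mr[c] : 0.0f);
            max_val = fmaxf(max_val, dr[c]);
        }
        float sum = 0.0f;
        for (int64_t c = 0; c < ncols; ++c) {
            dr[c] = expf(dr[c] - max_val); // subtract the row max for numerical stability
            sum += dr[c];
        }
        for (int64_t c = 0; c < ncols; ++c) {
            dr[c] /= sum;
        }
    }
}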
@@ -7016,7 +7392,7 @@ static void ggml_cuda_op_mul_mat(
7016
7392
  const int64_t ne01 = src0->ne[1];
7017
7393
  const int64_t ne02 = src0->ne[2];
7018
7394
  const int64_t ne03 = src0->ne[3];
7019
- // const int64_t nrows0 = ggml_nrows(src0);
7395
+ const int64_t nrows0 = ggml_nrows(src0);
7020
7396
 
7021
7397
  const int64_t ne10 = src1->ne[0];
7022
7398
  const int64_t ne11 = src1->ne[1];
@@ -7052,10 +7428,9 @@ static void ggml_cuda_op_mul_mat(
7052
7428
 
7053
7429
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
7054
7430
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
7055
-
7056
7431
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
7057
- const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
7058
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
7432
+
7433
+ const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
7059
7434
 
7060
7435
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
7061
7436
  GGML_ASSERT(!(split && ne02 > 1));
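The GGML_PAD(ne10, MATRIX_ROW_PADDING) form above replaces the hand-written modulo arithmetic; both compute the smallest multiple of MATRIX_ROW_PADDING that is greater than or equal to ne10. A tiny illustrative equivalent (not the macro's actual definition, only the behaviour relied on here):

#include <cstdint>

// Round x up to the next multiple of n; n = MATRIX_ROW_PADDING (512) in this file.
static int64_t round_up_to_multiple(int64_t x, int64_t n) {
    return (x + n - 1) / n * n;
}
// round_up_to_multiple(4096, 512) == 4096 and round_up_to_multiple(4097, 512) == 4608,
// matching the removed "ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING" branch.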
@@ -7180,7 +7555,7 @@ static void ggml_cuda_op_mul_mat(
7180
7555
  const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
7181
7556
 
7182
7557
  // for split tensors the data begins at i0 == i0_offset_low
7183
- char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
7558
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
7184
7559
  float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
7185
7560
  char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
7186
7561
  float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@@ -7325,6 +7700,10 @@ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, gg
7325
7700
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
7326
7701
  }
7327
7702
 
7703
+ static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7704
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
7705
+ }
7706
+
7328
7707
  static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7329
7708
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
7330
7709
  }
@@ -7350,7 +7729,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
7350
7729
  }
7351
7730
 
7352
7731
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
7353
- if (!g_cublas_loaded) { return false; }
7732
+ if (!g_cublas_loaded) return false;
7354
7733
 
7355
7734
  const int64_t ne10 = src1->ne[0];
7356
7735
 
@@ -7428,7 +7807,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7428
7807
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7429
7808
  }
7430
7809
 
7431
- __global__ static void k_compute_batched_ptrs(
7810
+ static __global__ void k_compute_batched_ptrs(
7432
7811
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7433
7812
  const void ** ptrs_src, void ** ptrs_dst,
7434
7813
  int ne12, int ne13,
@@ -7484,9 +7863,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7484
7863
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7485
7864
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7486
7865
 
7487
- int id;
7488
- CUDA_CHECK(cudaGetDevice(&id));
7489
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
7866
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
7490
7867
 
7491
7868
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7492
7869
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -7543,7 +7920,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7543
7920
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
7544
7921
  // use cublasGemmStridedBatchedEx
7545
7922
  CUBLAS_CHECK(
7546
- cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7923
+ cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7547
7924
  ne01, ne11, ne10,
7548
7925
  &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
7549
7926
  (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
@@ -7577,7 +7954,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7577
7954
  CUDA_CHECK(cudaGetLastError());
7578
7955
 
7579
7956
  CUBLAS_CHECK(
7580
- cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7957
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7581
7958
  ne01, ne11, ne10,
7582
7959
  &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7583
7960
  (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
@@ -7647,10 +8024,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7647
8024
  #ifdef GGML_CUDA_FORCE_DMMV
7648
8025
  const bool use_mul_mat_vec_q = false;
7649
8026
  #else
7650
- const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
8027
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
7651
8028
  #endif // GGML_CUDA_FORCE_DMMV
7652
8029
 
7653
8030
  if (use_mul_mat_vec_q) {
8031
+ // NOTE: this kernel does not support ggml_nrows(src1) > 1
7654
8032
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
7655
8033
  } else {
7656
8034
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
@@ -7675,42 +8053,255 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7675
8053
  }
7676
8054
  }
7677
8055
 
7678
- static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7679
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
7680
- }
8056
+ #if 0
8057
+ template<typename ... Srcs>
8058
+ static __global__ void k_compute_batched_ptrs_id(
8059
+ const void ** ptrs_src, void ** ptrs_dst,
8060
+ int ne12, int ne13,
8061
+ int ne23,
8062
+ int nb02, int nb03,
8063
+ int nb12, int nb13,
8064
+ int nb2, int nb3,
8065
+ int r2, int r3,
8066
+ ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
8067
+ const half * src1_f16, half * dst_f16,
8068
+ const int32_t * ids, const int id,
8069
+ Srcs... src0s) {
8070
+
8071
+ int i = ids[id];
8072
+
8073
+ half * src0_f16;
8074
+ const void * srcs_ar[] = { (const half *) src0s... };
8075
+ if (src0_type == GGML_TYPE_F16) {
8076
+ src0_f16 = (half *) srcs_ar[i];
8077
+ } else {
8078
+ src0_f16 = src0_as_f16;
8079
+ if (threadIdx.x == 0 && threadIdx.y == 0) {
8080
+ const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
8081
+ to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
8082
+ }
8083
+ }
7681
8084
 
7682
- static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7683
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
8085
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
8086
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
8087
+
8088
+ if (i13 >= ne13 || i12 >= ne12) {
8089
+ return;
8090
+ }
8091
+
8092
+ int i03 = i13 / r3;
8093
+ int i02 = i12 / r2;
8094
+
8095
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
8096
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
8097
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
7684
8098
  }
7685
8099
 
7686
- static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7687
- const int64_t ne = ggml_nelements(src0);
7688
- GGML_ASSERT(ne == ggml_nelements(src1));
8100
+ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
8101
+ const struct ggml_tensor * ids = dst->src[0];
8102
+ const struct ggml_tensor * src1 = dst->src[1];
8103
+ const struct ggml_tensor * src00 = dst->src[2];
7689
8104
 
7690
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
7691
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
8105
+ const int id = dst->op_params[0];
7692
8106
 
7693
- GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
7694
- GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
8107
+ GGML_ASSERT(!ggml_is_transposed(src00));
8108
+ GGML_ASSERT(!ggml_is_transposed(src1));
7695
8109
 
7696
- const int64_t ne00 = src0->ne[0];
7697
- const int64_t ne01 = src0->ne[1];
7698
- GGML_ASSERT(src0->ne[3] == 1);
8110
+ GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
8111
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
7699
8112
 
7700
- const int64_t nb00 = src0->nb[0];
7701
- const int64_t nb01 = src0->nb[1];
7702
- const int64_t nb02 = src0->nb[2];
8113
+ const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
8114
+ const int64_t ne01 = src00->ne[1];
8115
+ const int64_t ne02 = src00->ne[2];
8116
+ const int64_t ne03 = src00->ne[3];
8117
+
8118
+ //const int64_t nb01 = src00->nb[1];
8119
+ const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
8120
+ const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
7703
8121
 
7704
8122
  const int64_t ne10 = src1->ne[0];
7705
8123
  const int64_t ne11 = src1->ne[1];
7706
- GGML_ASSERT(src1->ne[3] == 1);
8124
+ const int64_t ne12 = src1->ne[2];
8125
+ const int64_t ne13 = src1->ne[3];
7707
8126
 
7708
- const int64_t nb10 = src1->nb[0];
7709
- const int64_t nb11 = src1->nb[1];
7710
- const int64_t nb12 = src1->nb[2];
8127
+ //const int64_t nb11 = src1->nb[1];
8128
+ const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
8129
+ const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
7711
8130
 
7712
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7713
- cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8131
+ const int64_t ne1 = ggml_nelements(src1);
8132
+ const int64_t ne = ggml_nelements(dst);
8133
+
8134
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8135
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8136
+
8137
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
8138
+
8139
+ //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
8140
+ //void * src0_ddq = src0_extra->data_device[g_main_device];
8141
+ //half * src0_as_f16 = (half *) src0_ddq;
8142
+
8143
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
8144
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
8145
+
8146
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
8147
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
8148
+
8149
+ // convert src1 to fp16
8150
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
8151
+ GGML_ASSERT(to_fp16_cuda != nullptr);
8152
+
8153
+ size_t src1_as = 0;
8154
+ half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
8155
+ to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
8156
+
8157
+ size_t dst_as = 0;
8158
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8159
+
8160
+ GGML_ASSERT(ne12 % ne02 == 0);
8161
+ GGML_ASSERT(ne13 % ne03 == 0);
8162
+
8163
+ // broadcast factors
8164
+ const int64_t r2 = ne12/ne02;
8165
+ const int64_t r3 = ne13/ne03;
8166
+
8167
+ const half alpha_f16 = 1.0f;
8168
+ const half beta_f16 = 0.0f;
8169
+
8170
+ // use cublasGemmBatchedEx
8171
+ const int ne23 = ne12*ne13;
8172
+
8173
+ const void ** ptrs_src = nullptr;
8174
+ void ** ptrs_dst = nullptr;
8175
+
8176
+ size_t ptrs_src_s = 0;
8177
+ size_t ptrs_dst_s = 0;
8178
+
8179
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
8180
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
8181
+
8182
+ int64_t src0_ne = ggml_nelements(src00);
8183
+ half * src0_as_f16 = nullptr;
8184
+ size_t src0_as = 0;
8185
+ if (src00->type != GGML_TYPE_F16) {
8186
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
8187
+ }
8188
+
8189
+ static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
8190
+ dim3 block_dims(ne13, ne12);
8191
+ k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
8192
+ ptrs_src, ptrs_dst,
8193
+ ne12, ne13,
8194
+ ne23,
8195
+ ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
8196
+ nb12, nb13,
8197
+ dst->nb[2], dst->nb[3],
8198
+ r2, r3,
8199
+ src00->type, src0_as_f16, src0_ne,
8200
+ src1_as_f16, dst_f16,
8201
+ (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
8202
+ dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
8203
+ dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
8204
+ dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
8205
+ dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
8206
+ );
8207
+ CUDA_CHECK(cudaGetLastError());
8208
+
8209
+ CUBLAS_CHECK(
8210
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8211
+ ne01, ne11, ne10,
8212
+ &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
8213
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
8214
+ &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
8215
+ ne23,
8216
+ CUBLAS_COMPUTE_16F,
8217
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8218
+
8219
+ if (src0_as != 0) {
8220
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
8221
+ }
8222
+ if (ptrs_src_s != 0) {
8223
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
8224
+ }
8225
+ if (ptrs_dst_s != 0) {
8226
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
8227
+ }
8228
+
8229
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8230
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8231
+
8232
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
8233
+ ggml_cuda_pool_free(dst_f16, dst_as);
8234
+ }
8235
+ #endif
8236
+
8237
+ static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
8238
+ #if 0
8239
+ //#ifdef CUDA_USE_TENSOR_CORES
8240
+ // const bool use_tensor_cores = true;
8241
+ //#else
8242
+ // const bool use_tensor_cores = false;
8243
+ //#endif
8244
+
8245
+ ggml_cuda_mul_mat_id_cublas(dst);
8246
+
8247
+ // TODO: mmq/mmv support
8248
+ #else
8249
+ const struct ggml_tensor * ids = dst->src[0];
8250
+ const struct ggml_tensor * src1 = dst->src[1];
8251
+ const int id = dst->op_params[0];
8252
+
8253
+ int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8254
+
8255
+ int32_t a_id;
8256
+ CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8257
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8258
+
8259
+ GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
8260
+ const struct ggml_tensor * src0 = dst->src[a_id + 2];
8261
+
8262
+ ggml_cuda_mul_mat(src0, src1, dst);
8263
+ #endif
8264
+
8265
+ (void) _src0;
8266
+ (void) _src1;
8267
+ }
8268
+
8269
+ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8270
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
8271
+ }
8272
+
8273
+ static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8274
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
8275
+ }
8276
+
8277
+ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8278
+ const int64_t ne = ggml_nelements(src0);
8279
+ GGML_ASSERT(ne == ggml_nelements(src1));
8280
+
8281
+ GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
8282
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
8283
+
8284
+ GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
8285
+ GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
8286
+
8287
+ const int64_t ne00 = src0->ne[0];
8288
+ const int64_t ne01 = src0->ne[1];
8289
+ GGML_ASSERT(src0->ne[3] == 1);
8290
+
8291
+ const int64_t nb00 = src0->nb[0];
8292
+ const int64_t nb01 = src0->nb[1];
8293
+ const int64_t nb02 = src0->nb[2];
8294
+
8295
+ const int64_t ne10 = src1->ne[0];
8296
+ const int64_t ne11 = src1->ne[1];
8297
+ GGML_ASSERT(src1->ne[3] == 1);
8298
+
8299
+ const int64_t nb10 = src1->nb[0];
8300
+ const int64_t nb11 = src1->nb[1];
8301
+ const int64_t nb12 = src1->nb[2];
8302
+
8303
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8304
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7714
8305
 
7715
8306
  const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7716
8307
  const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
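Before the cpy changes continue below, the new GGML_OP_MUL_MAT_ID path added in this hunk is worth a note: the active (#else) branch copies a single expert index from the ids tensor back to the host, selects the matching weight tensor from dst->src[a_id + 2], and then falls through to the ordinary ggml_cuda_mul_mat, while the batched cuBLAS variant above it stays compiled out under #if 0. A hedged host-side sketch of the same selection logic, using hypothetical plain arrays instead of ggml tensors:

#include <cstdint>
#include <vector>

// Mixture-of-experts style dispatch: ids[id] selects which expert weight matrix
// multiplies the shared input. Each expert is rows x cols; x holds cols values.
static std::vector<float> mul_mat_id_ref(const std::vector<std::vector<float>> & experts,
                                         const int32_t * ids, int id,
                                         const float * x, int rows, int cols) {
    const int32_t a_id = ids[id];          // same role as the cudaMemcpyAsync of ids_dev + id
    const std::vector<float> & w = experts[a_id];

    std::vector<float> y(rows, 0.0f);
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            y[r] += w[r*cols + c] * x[c];  // ordinary mat-vec once the expert is chosen
        }
    }
    return y;
}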
@@ -7719,14 +8310,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7719
8310
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
7720
8311
 
7721
8312
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
7722
- ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7723
- ne10, ne11, nb10, nb11, nb12, main_stream);
8313
+ ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7724
8314
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
7725
- ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7726
- ne10, ne11, nb10, nb11, nb12, main_stream);
8315
+ ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8316
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
8317
+ ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8318
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
8319
+ ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8320
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
8321
+ ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7727
8322
  } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
7728
- ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7729
- ne10, ne11, nb10, nb11, nb12, main_stream);
8323
+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7730
8324
  } else {
7731
8325
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
7732
8326
  ggml_type_name(src0->type), ggml_type_name(src1->type));
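The new F32 -> Q8_0/Q4_0/Q4_1 branches above let ggml_cuda_cpy quantize while copying instead of failing on those type combinations (useful for a quantized KV cache). As a reference for what the Q8_0 target stores, here is a hedged CPU sketch of quantizing one 32-value block: a per-block scale d = max|x| / 127 plus 32 int8 values, which is the standard Q8_0 block layout; the actual ggml_cpy_f32_q8_0_cuda kernel additionally handles strides and stores d as fp16.

#include <cmath>
#include <cstdint>

#define QK8_0_REF 32  // block size assumed here, matching ggml's QK8_0

struct block_q8_0_ref {
    float  d;               // per-block scale (ggml stores this as fp16)
    int8_t qs[QK8_0_REF];   // quantized values
};

// Quantize exactly one block of 32 floats.
static void quantize_block_q8_0_ref(const float * x, block_q8_0_ref * out) {
    float amax = 0.0f;
    for (int i = 0; i < QK8_0_REF; ++i) {
        amax = fmaxf(amax, fabsf(x[i]));
    }
    const float d  = amax / 127.0f;
    const float id = d != 0.0f ? 1.0f/d : 0.0f;

    out->d = d;
    for (int i = 0; i < QK8_0_REF; ++i) {
        out->qs[i] = (int8_t) roundf(x[i]*id);
    }
}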
@@ -7737,6 +8331,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7737
8331
  }
7738
8332
 
7739
8333
  static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8334
+ // TODO: why do we pass dst as src1 here?
7740
8335
  ggml_cuda_cpy(src0, dst, nullptr);
7741
8336
  (void) src1;
7742
8337
  }
@@ -7762,6 +8357,16 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
7762
8357
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
7763
8358
  }
7764
8359
 
8360
+ static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8361
+ GGML_ASSERT(ggml_is_contiguous(src0));
8362
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
8363
+ }
8364
+
8365
+ static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8366
+ GGML_ASSERT(ggml_is_contiguous(src0));
8367
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
8368
+ }
8369
+
7765
8370
  static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7766
8371
  (void) src0;
7767
8372
  (void) src1;
@@ -8017,8 +8622,9 @@ void ggml_cuda_set_main_device(const int main_device) {
8017
8622
  main_device, g_device_count, g_main_device);
8018
8623
  return;
8019
8624
  }
8020
- g_main_device = main_device;
8021
- if (g_device_count > 1) {
8625
+
8626
+ if (g_main_device != main_device && g_device_count > 1) {
8627
+ g_main_device = main_device;
8022
8628
  cudaDeviceProp prop;
8023
8629
  CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
8024
8630
  fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
@@ -8044,7 +8650,7 @@ void ggml_cuda_free_scratch() {
8044
8650
  }
8045
8651
 
8046
8652
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
8047
- if (!g_cublas_loaded) { return false; }
8653
+ if (!g_cublas_loaded) return false;
8048
8654
 
8049
8655
  ggml_cuda_func_t func;
8050
8656
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8080,6 +8686,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8080
8686
  case GGML_OP_MUL:
8081
8687
  func = ggml_cuda_mul;
8082
8688
  break;
8689
+ case GGML_OP_DIV:
8690
+ func = ggml_cuda_div;
8691
+ break;
8083
8692
  case GGML_OP_UNARY:
8084
8693
  switch (ggml_get_unary_op(tensor)) {
8085
8694
  case GGML_UNARY_OP_GELU:
@@ -8093,7 +8702,8 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8093
8702
  break;
8094
8703
  default:
8095
8704
  return false;
8096
- } break;
8705
+ }
8706
+ break;
8097
8707
  case GGML_OP_NORM:
8098
8708
  func = ggml_cuda_norm;
8099
8709
  break;
@@ -8106,6 +8716,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8106
8716
  }
8107
8717
  func = ggml_cuda_mul_mat;
8108
8718
  break;
8719
+ case GGML_OP_MUL_MAT_ID:
8720
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
8721
+ return false;
8722
+ }
8723
+ func = ggml_cuda_mul_mat_id;
8724
+ break;
8109
8725
  case GGML_OP_SCALE:
8110
8726
  func = ggml_cuda_scale;
8111
8727
  break;
@@ -8145,6 +8761,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8145
8761
  case GGML_OP_IM2COL:
8146
8762
  func = ggml_cuda_im2col;
8147
8763
  break;
8764
+ case GGML_OP_SUM_ROWS:
8765
+ func = ggml_cuda_sum_rows;
8766
+ break;
8767
+ case GGML_OP_ARGSORT:
8768
+ func = ggml_cuda_argsort;
8769
+ break;
8148
8770
  default:
8149
8771
  return false;
8150
8772
  }
@@ -8161,7 +8783,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8161
8783
 
8162
8784
  int ggml_cuda_get_device_count() {
8163
8785
  int device_count;
8164
- CUDA_CHECK(cudaGetDeviceCount(&device_count));
8786
+ if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
8787
+ return 0;
8788
+ }
8165
8789
  return device_count;
8166
8790
  }
8167
8791
 
@@ -8177,27 +8801,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
8177
8801
 
8178
8802
  #define UNUSED GGML_UNUSED
8179
8803
 
8180
- struct ggml_backend_context_cuda {
8181
- };
8182
-
8183
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
8184
- return GGML_CUDA_NAME;
8185
-
8186
- UNUSED(backend);
8187
- }
8188
-
8189
- static void ggml_backend_cuda_free(ggml_backend_t backend) {
8190
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
8191
- delete cuda_ctx;
8192
- delete backend;
8193
- }
8804
+ // cuda buffer
8194
8805
 
8195
8806
  struct ggml_backend_buffer_context_cuda {
8196
- void * device;
8197
-
8807
+ int device;
8808
+ void * dev_ptr = nullptr;
8198
8809
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
8199
8810
  size_t temp_tensor_extra_index = 0;
8200
8811
 
8812
+ ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
8813
+
8201
8814
  ~ggml_backend_buffer_context_cuda() {
8202
8815
  delete[] temp_tensor_extras;
8203
8816
  }
@@ -8218,41 +8831,20 @@ struct ggml_backend_buffer_context_cuda {
8218
8831
 
8219
8832
  static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
8220
8833
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8221
- CUDA_CHECK(cudaFree(ctx->device));
8834
+ CUDA_CHECK(cudaFree(ctx->dev_ptr));
8222
8835
  delete ctx;
8223
8836
  }
8224
8837
 
8225
8838
  static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
8226
8839
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8227
- return ctx->device;
8228
- }
8229
-
8230
- static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8231
- int64_t row_low = 0;
8232
- int64_t row_high = ggml_nrows(tensor);
8233
- int64_t nrows_split = row_high - row_low;
8234
-
8235
- size_t size = ggml_nbytes_split(tensor, nrows_split);
8236
-
8237
- int64_t ne0 = tensor->ne[0];
8238
-
8239
- if (ggml_is_quantized(tensor->type)) {
8240
- if (ne0 % MATRIX_ROW_PADDING != 0) {
8241
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8242
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8243
- }
8244
- }
8245
-
8246
- return size;
8247
-
8248
- UNUSED(buffer);
8840
+ return ctx->dev_ptr;
8249
8841
  }
8250
8842
 
8251
8843
  static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8252
8844
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8253
8845
 
8254
8846
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
8255
- assert(tensor->view_src->buffer->backend == buffer->backend);
8847
+ assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
8256
8848
  tensor->backend = tensor->view_src->backend;
8257
8849
  tensor->extra = tensor->view_src->extra;
8258
8850
  return;
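The get_alloc_size hook removed above is not gone: the same row-padding computation reappears below as ggml_backend_cuda_buffer_type_get_alloc_size, and the continuation of init_tensor now queries it through ggml_backend_buft_get_alloc_size. The arithmetic itself is simple; a standalone sketch (hypothetical helper, mirroring the code in this diff) of the padding rule for quantized tensors, which reserves extra bytes so the last row can be read in MATRIX_ROW_PADDING-sized chunks without going out of bounds:

#include <cstddef>
#include <cstdint>

// Bytes to allocate for a quantized matrix whose rows are ne0 elements wide:
// the base size plus, when ne0 is not a multiple of the padding, enough extra
// bytes to round the final row up to row_padding elements.
static size_t padded_alloc_size(size_t base_size, int64_t ne0, bool is_quantized,
                                size_t type_size, int64_t block_size,
                                int64_t row_padding /* MATRIX_ROW_PADDING */) {
    size_t size = base_size;
    if (is_quantized && ne0 % row_padding != 0) {
        size += (row_padding - ne0 % row_padding) * type_size / block_size;
    }
    return size;
}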
@@ -8260,7 +8852,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8260
8852
 
8261
8853
  ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
8262
8854
 
8263
- extra->data_device[g_main_device] = tensor->data;
8855
+ extra->data_device[ctx->device] = tensor->data;
8264
8856
 
8265
8857
  tensor->backend = GGML_BACKEND_GPU;
8266
8858
  tensor->extra = extra;
@@ -8272,64 +8864,208 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8272
8864
  int64_t nrows_split = row_high - row_low;
8273
8865
 
8274
8866
  size_t original_size = ggml_nbytes_split(tensor, nrows_split);
8275
- size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
8867
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
8276
8868
 
8277
8869
  if (padded_size > original_size && tensor->view_src == nullptr) {
8278
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
8870
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
8279
8871
  }
8280
8872
  }
8281
8873
 
8282
8874
  UNUSED(buffer);
8283
8875
  }
8284
8876
 
8877
+ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
8878
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
8879
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8880
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8881
+
8882
+ CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
8883
+
8884
+ UNUSED(buffer);
8885
+ }
8886
+
8887
+ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
8888
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
8889
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8890
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8891
+
8892
+ CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
8893
+
8894
+ UNUSED(buffer);
8895
+ }
8896
+
8285
8897
  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
8286
- /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
8287
- /* .get_base = */ ggml_backend_cuda_buffer_get_base,
8288
- /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
8289
- /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
8290
- /* .free_tensor = */ NULL,
8898
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
8899
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
8900
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
8901
+ /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
8902
+ /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
8903
+ /* .cpy_tensor_from = */ NULL,
8904
+ /* .cpy_tensor_to = */ NULL,
8291
8905
  };
8292
8906
 
8293
- static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
8294
- ggml_cuda_set_device(g_main_device);
8907
+ // cuda buffer type
8295
8908
 
8296
- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
8909
+ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
8910
+ int device = (int) (intptr_t) buft->context;
8911
+
8912
+ ggml_cuda_set_device(device);
8297
8913
 
8298
8914
  size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
8299
8915
 
8300
- ggml_cuda_set_device(g_main_device);
8301
- CUDA_CHECK(cudaMalloc(&ctx->device, size));
8916
+ void * dev_ptr;
8917
+ CUDA_CHECK(cudaMalloc(&dev_ptr, size));
8302
8918
 
8303
- return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
8919
+ ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
8920
+
8921
+ return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
8304
8922
  }
8305
8923
 
8306
- static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
8924
+ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
8307
8925
  return 128;
8926
+
8927
+ UNUSED(buft);
8928
+ }
8929
+
8930
+ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
8931
+ int64_t row_low = 0;
8932
+ int64_t row_high = ggml_nrows(tensor);
8933
+ int64_t nrows_split = row_high - row_low;
8934
+
8935
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
8936
+
8937
+ int64_t ne0 = tensor->ne[0];
8938
+
8939
+ if (ggml_is_quantized(tensor->type)) {
8940
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
8941
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8942
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8943
+ }
8944
+ }
8945
+
8946
+ return size;
8947
+
8948
+ UNUSED(buft);
8949
+ }
8950
+
8951
+ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
8952
+ return ggml_backend_is_cuda(backend);
8953
+
8954
+ UNUSED(buft);
8955
+ }
8956
+
8957
+ static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
8958
+ /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
8959
+ /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
8960
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
8961
+ /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
8962
+ };
8963
+
8964
+ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
8965
+ static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
8966
+ static bool ggml_backend_buffer_type_cuda_initialized = false;
8967
+ if (!ggml_backend_buffer_type_cuda_initialized) {
8968
+ for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
8969
+ ggml_backend_buffer_type_cuda[i] = {
8970
+ /* .iface = */ cuda_backend_buffer_type_interface,
8971
+ /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
8972
+ };
8973
+ }
8974
+ ggml_backend_buffer_type_cuda_initialized = true;
8975
+ }
8976
+
8977
+ return &ggml_backend_buffer_type_cuda[device];
8978
+ }
8979
+
8980
+ // host buffer type
8981
+
8982
+ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
8983
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8984
+ CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
8985
+ delete ctx;
8986
+ }
8987
+
8988
+ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
8989
+ void * ptr;
8990
+ CUDA_CHECK(cudaMallocHost(&ptr, size));
8991
+
8992
+ // FIXME: this is a hack to avoid having to implement a new buffer type
8993
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
8994
+ buffer->buft = buft;
8995
+ buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
8996
+
8997
+ return buffer;
8998
+
8999
+ UNUSED(buft);
9000
+ }
9001
+
9002
+ struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
9003
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
9004
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
9005
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9006
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
9007
+ };
9008
+
9009
+ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
9010
+ static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
9011
+ /* .iface = */ cuda_backend_host_buffer_type_interface,
9012
+ /* .context = */ nullptr,
9013
+ };
9014
+
9015
+ return &ggml_backend_buffer_type_cuda_host;
9016
+ }
9017
+
9018
+ // backend
9019
+
9020
+ struct ggml_backend_context_cuda {
9021
+ int device;
9022
+ };
9023
+
9024
+ static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
9025
+ return GGML_CUDA_NAME;
9026
+
8308
9027
  UNUSED(backend);
8309
9028
  }
8310
9029
 
9030
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
9031
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9032
+
9033
+ delete cuda_ctx;
9034
+ delete backend;
9035
+ }
9036
+
9037
+ static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
9038
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9039
+
9040
+ return ggml_backend_cuda_buffer_type(cuda_ctx->device);
9041
+ }
9042
+
8311
9043
  static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
9044
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9045
+
9046
+ GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
8312
9047
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
8313
9048
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8314
9049
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8315
9050
 
8316
- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
8317
-
8318
- UNUSED(backend);
9051
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
8319
9052
  }
8320
9053
 
8321
9054
  static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
9055
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9056
+
9057
+ GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
8322
9058
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
8323
9059
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8324
9060
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8325
9061
 
8326
- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8327
-
8328
- UNUSED(backend);
9062
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
8329
9063
  }
8330
9064
 
8331
9065
  static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
8332
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
9066
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9067
+
9068
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
8333
9069
 
8334
9070
  UNUSED(backend);
8335
9071
  }
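The block above replaces the single implicit g_main_device buffer with per-device buffer types: ggml_backend_cuda_buffer_type(device) hands out one ggml_backend_buffer_type per GPU (the device index is stashed in the context pointer), ggml_backend_cuda_host_buffer_type() wraps cudaMallocHost for pinned staging memory, and the backend context now records which device it drives. A hedged usage sketch, assuming the generic ggml_backend_buft_alloc_buffer helper from ggml-backend.h is available alongside the functions shown in this diff:

#include <cstdio>
#include "ggml-backend.h"
#include "ggml-cuda.h"

static void show_cuda_buffer_types() {
    // Device buffer for GPU 0; a separate buffer type object exists per device.
    ggml_backend_buffer_type_t buft = ggml_backend_cuda_buffer_type(0);
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16*1024*1024);

    // Pinned host memory, useful as a staging area for faster H2D/D2H copies.
    ggml_backend_buffer_type_t host_buft = ggml_backend_cuda_host_buffer_type();
    ggml_backend_buffer_t host_buf = ggml_backend_buft_alloc_buffer(host_buft, 16*1024*1024);

    printf("device buffer base: %p, host buffer base: %p\n",
           ggml_backend_buffer_get_base(buf), ggml_backend_buffer_get_base(host_buf));

    ggml_backend_buffer_free(buf);
    ggml_backend_buffer_free(host_buf);
}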
@@ -8343,14 +9079,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
8343
9079
  UNUSED(cgraph);
8344
9080
  }
8345
9081
 
8346
- [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
9082
+ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8347
9083
  GGML_ASSERT(!"not implemented");
8348
9084
 
8349
9085
  UNUSED(backend);
8350
9086
  UNUSED(plan);
8351
9087
  }
8352
9088
 
8353
- [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
9089
+ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8354
9090
  GGML_ASSERT(!"not implemented");
8355
9091
 
8356
9092
  UNUSED(backend);
@@ -8358,7 +9094,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
8358
9094
  }
8359
9095
 
8360
9096
  static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
8361
- ggml_cuda_set_device(g_main_device);
9097
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9098
+
9099
+ ggml_cuda_set_main_device(cuda_ctx->device);
8362
9100
 
8363
9101
  ggml_compute_params params = {};
8364
9102
  params.type = GGML_TASK_COMPUTE;
@@ -8366,13 +9104,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
8366
9104
  for (int i = 0; i < cgraph->n_nodes; i++) {
8367
9105
  ggml_tensor * node = cgraph->nodes[i];
8368
9106
 
8369
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
9107
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
8370
9108
  continue;
8371
- }
9109
+
8372
9110
  assert(node->backend == GGML_BACKEND_GPU);
9111
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
9112
+ assert(node->extra != nullptr);
9113
+
8373
9114
  for (int j = 0; j < GGML_MAX_SRC; j++) {
8374
9115
  if (node->src[j] != nullptr) {
8375
9116
  assert(node->src[j]->backend == GGML_BACKEND_GPU);
9117
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
9118
+ assert(node->src[j]->extra != nullptr);
8376
9119
  }
8377
9120
  }
8378
9121
 
@@ -8409,27 +9152,98 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
8409
9152
  UNUSED(backend);
8410
9153
  }
8411
9154
 
9155
+ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
9156
+ switch (op->op) {
9157
+ case GGML_OP_UNARY:
9158
+ switch (ggml_get_unary_op(op)) {
9159
+ case GGML_UNARY_OP_GELU:
9160
+ case GGML_UNARY_OP_SILU:
9161
+ case GGML_UNARY_OP_RELU:
9162
+ return true;
9163
+ default:
9164
+ return false;
9165
+ }
9166
+ break;
9167
+ case GGML_OP_MUL_MAT:
9168
+ case GGML_OP_MUL_MAT_ID:
9169
+ {
9170
+ struct ggml_tensor * a;
9171
+ struct ggml_tensor * b;
9172
+ if (op->op == GGML_OP_MUL_MAT) {
9173
+ a = op->src[0];
9174
+ b = op->src[1];
9175
+ } else {
9176
+ a = op->src[2];
9177
+ b = op->src[1];
9178
+ }
9179
+ if (a->ne[3] != b->ne[3]) {
9180
+ return false;
9181
+ }
9182
+ return true;
9183
+ } break;
9184
+ case GGML_OP_NONE:
9185
+ case GGML_OP_RESHAPE:
9186
+ case GGML_OP_VIEW:
9187
+ case GGML_OP_PERMUTE:
9188
+ case GGML_OP_TRANSPOSE:
9189
+ case GGML_OP_NORM:
9190
+ case GGML_OP_REPEAT:
9191
+ case GGML_OP_GET_ROWS:
9192
+ case GGML_OP_DUP:
9193
+ case GGML_OP_ADD:
9194
+ case GGML_OP_MUL:
9195
+ case GGML_OP_DIV:
9196
+ case GGML_OP_RMS_NORM:
9197
+ case GGML_OP_SCALE:
9198
+ case GGML_OP_SQR:
9199
+ case GGML_OP_CLAMP:
9200
+ case GGML_OP_CPY:
9201
+ case GGML_OP_CONT:
9202
+ case GGML_OP_DIAG_MASK_INF:
9203
+ case GGML_OP_SOFT_MAX:
9204
+ case GGML_OP_ROPE:
9205
+ case GGML_OP_ALIBI:
9206
+ case GGML_OP_IM2COL:
9207
+ case GGML_OP_SUM_ROWS:
9208
+ case GGML_OP_ARGSORT:
9209
+ return true;
9210
+ default:
9211
+ return false;
9212
+ }
9213
+
9214
+ UNUSED(backend);
9215
+ }
9216
+
8412
9217
  static ggml_backend_i cuda_backend_i = {
8413
- /* .get_name = */ ggml_backend_cuda_name,
8414
- /* .free = */ ggml_backend_cuda_free,
8415
- /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
8416
- /* .get_alignment = */ ggml_backend_cuda_get_alignment,
8417
- /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
8418
- /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
8419
- /* .synchronize = */ ggml_backend_cuda_synchronize,
8420
- /* .cpy_tensor_from = */ nullptr,
8421
- /* .cpy_tensor_to = */ nullptr,
8422
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
8423
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
8424
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
8425
- /* .graph_compute = */ ggml_backend_cuda_graph_compute,
8426
- /* .supports_op = */ nullptr,
9218
+ /* .get_name = */ ggml_backend_cuda_name,
9219
+ /* .free = */ ggml_backend_cuda_free,
9220
+ /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
9221
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
9222
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
9223
+ /* .cpy_tensor_from_async = */ NULL,
9224
+ /* .cpy_tensor_to_async = */ NULL,
9225
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
9226
+ /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
9227
+ /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
9228
+ /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
9229
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
9230
+ /* .supports_op = */ ggml_backend_cuda_supports_op,
8427
9231
  };
8428
9232
 
8429
- ggml_backend_t ggml_backend_cuda_init() {
9233
+ ggml_backend_t ggml_backend_cuda_init(int device) {
8430
9234
  ggml_init_cublas(); // TODO: remove from ggml.c
8431
9235
 
8432
- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
9236
+ if (device < 0 || device >= ggml_cuda_get_device_count()) {
9237
+ fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
9238
+ return nullptr;
9239
+ }
9240
+
9241
+ // not strictly necessary, but it may reduce the overhead of the first graph_compute
9242
+ ggml_cuda_set_main_device(device);
9243
+
9244
+ ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
9245
+ /* .device = */ device
9246
+ };
8433
9247
 
8434
9248
  ggml_backend_t cuda_backend = new ggml_backend {
8435
9249
  /* .interface = */ cuda_backend_i,
@@ -8438,3 +9252,25 @@ ggml_backend_t ggml_backend_cuda_init() {
8438
9252
 
8439
9253
  return cuda_backend;
8440
9254
  }
9255
+
9256
+ bool ggml_backend_is_cuda(ggml_backend_t backend) {
9257
+ return backend->iface.get_name == ggml_backend_cuda_name;
9258
+ }
9259
+
9260
+ static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
9261
+ ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
9262
+ return cuda_backend;
9263
+
9264
+ UNUSED(params);
9265
+ }
9266
+
9267
+ extern "C" int ggml_backend_cuda_reg_devices() {
9268
+ int device_count = ggml_cuda_get_device_count();
9269
+ //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
9270
+ for (int i = 0; i < device_count; i++) {
9271
+ char name[128];
9272
+ snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
9273
+ ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
9274
+ }
9275
+ return device_count;
9276
+ }
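ggml_backend_cuda_reg_devices() above registers one named backend per visible GPU (GGML_CUDA_NAME followed by the device index), each backed by ggml_backend_cuda_init(device) and that device's buffer type, and ggml_backend_cuda_init now validates the device index and returns nullptr instead of asserting. A short usage sketch of the per-device initializer, using only functions that appear in this diff (the wrapper name is hypothetical):

#include <cstdio>
#include "ggml-backend.h"
#include "ggml-cuda.h"

static ggml_backend_t init_cuda_backend_or_report(int device) {
    ggml_backend_t backend = ggml_backend_cuda_init(device); // nullptr on invalid device
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize CUDA backend for device %d\n", device);
        return nullptr;
    }
    // Sanity check: the returned backend identifies itself as CUDA.
    if (!ggml_backend_is_cuda(backend)) {
        fprintf(stderr, "unexpected backend type for device %d\n", device);
    }
    return backend;
}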