llama_cpp 0.9.4 → 0.10.0

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,7 +1,8 @@
  #include <algorithm>
- #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
+ #include <cinttypes>
+ #include <float.h>
  #include <limits>
  #include <stdint.h>
  #include <stdio.h>
@@ -69,6 +70,7 @@
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
  #define cudaSetDevice hipSetDevice
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamFireAndForget hipStreamFireAndForget
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
@@ -190,7 +192,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
  cudaGetErrorString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"CUDA error"); \
  } \
  } while (0)

@@ -204,7 +206,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #else
@@ -216,7 +218,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
  fprintf(stderr, "current device: %d\n", id); \
- exit(1); \
+ GGML_ASSERT(!"cuBLAS error"); \
  } \
  } while (0)
  #endif // CUDART_VERSION >= 11
@@ -433,8 +435,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define WARP_SIZE 32
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

- #define CUDA_ADD_BLOCK_SIZE 256
- #define CUDA_MUL_BLOCK_SIZE 256
  #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
  #define CUDA_RELU_BLOCK_SIZE 256
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_CLAMP_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_SOFT_MAX_BLOCK_SIZE 1024
  #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -501,40 +502,112 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

- static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
- if (i >= kx) {
- return;
+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
  }
- dst[i] = x[i] + y[i%ky];
+ return x;
  }

- static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+ }

- if (i >= k) {
- return;
+ static __device__ __forceinline__ float warp_reduce_max(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
  }
- dst[i] = __hadd(x[i], __float2half(y[i]));
+ return x;
  }

- static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+ static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+ return b;
+ }

- if (i >= k) {
+ static __device__ __forceinline__ float op_add(const float a, const float b) {
+ return a + b;
+ }
+
+ static __device__ __forceinline__ float op_mul(const float a, const float b) {
+ return a * b;
+ }
+
+ static __device__ __forceinline__ float op_div(const float a, const float b) {
+ return a / b;
+ }
+
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+ int ne0, int ne1, int ne2, int ne3,
+ int ne10, int ne11, int ne12, int ne13,
+ /*int s0, */ int s1, int s2, int s3,
+ /*int s10,*/ int s11, int s12, int s13) {
+ const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+ const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+ const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+ const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
+
+ if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
  return;
  }
- dst[i] = __half2float(x[i]) + y[i];
+
+ const int i11 = i1 % ne11;
+ const int i12 = i2 % ne12;
+ const int i13 = i3 % ne13;
+
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+ const size_t i_dst = i_src0;
+
+ const src0_t * src0_row = src0 + i_src0;
+ const src1_t * src1_row = src1 + i_src1;
+ dst_t * dst_row = dst + i_dst;
+
+ for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+ const int i10 = i0 % ne10;
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+ }
  }

- static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+ int ne0, int ne1, int ne2, int ne3,
+ int ne10, int ne11, int ne12, int ne13,
+ /*int s0, */ int s1, int s2, int s3,
+ /*int s10,*/ int s11, int s12, int s13) {
+
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

- if (i >= kx) {
+ const int i3 = i/(ne2*ne1*ne0);
+ const int i2 = (i/(ne1*ne0)) % ne2;
+ const int i1 = (i/ne0) % ne1;
+ const int i0 = i % ne0;
+
+ if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
  return;
  }
- dst[i] = x[i] * y[i%ky];
+
+ const int i11 = i1 % ne11;
+ const int i12 = i2 % ne12;
+ const int i13 = i3 % ne13;
+
+ const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+ const size_t i_dst = i_src0;
+
+ const src0_t * src0_row = src0 + i_src0;
+ const src1_t * src1_row = src1 + i_src1;
+ dst_t * dst_row = dst + i_dst;
+
+ const int i10 = i0 % ne10;
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
  }

  static __global__ void gelu_f32(const float * x, float * dst, const int k) {
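Editor's note: the hunk above introduces warp-level reduction helpers (warp_reduce_sum, warp_reduce_max) built on XOR shuffles. The following standalone sketch, which is not part of the diff, illustrates the same butterfly-reduction pattern; the kernel and function names here are invented for illustration only.

    // Minimal sketch of the XOR-shuffle warp reduction used above.
    // Each step halves the number of distinct partial sums; after 5 steps
    // every lane of the 32-thread warp holds the full warp sum.
    __device__ __forceinline__ float warp_sum_demo(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);
        }
        return x;
    }

    __global__ void k_warp_sum_demo(const float * x, float * out) {
        const float s = warp_sum_demo(x[threadIdx.x]);
        if (threadIdx.x == 0) {
            *out = s; // lane 0 writes the warp-wide sum
        }
    }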
@@ -577,22 +650,11 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] * x[i];
  }

- static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
- a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
- }
- return a;
- }
-
  template <int block_size>
- static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+ static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

- const float eps = 1e-5f;
-
  float2 mean_var = make_float2(0.f, 0.f);

  for (int col = tid; col < ncols; col += block_size) {
@@ -624,14 +686,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  }
  }

- static __device__ __forceinline__ float warp_reduce_sum(float x) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- x += __shfl_xor_sync(0xffffffff, x, mask, 32);
- }
- return x;
- }
-
  template <int block_size>
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -4550,6 +4604,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  cpy_1(cx + x_offset, cdst + dst_offset);
  }

+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+ const float * xi = (const float *) cxi;
+ block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+ float amax = 0.0f; // absolute max
+
+ for (int j = 0; j < QK8_0; j++) {
+ const float v = xi[j];
+ amax = fmaxf(amax, fabsf(v));
+ }
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ dsti->d = d;
+
+ for (int j = 0; j < QK8_0; ++j) {
+ const float x0 = xi[j]*id;
+
+ dsti->qs[j] = roundf(x0);
+ }
+ }
+
+ static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+ const float * xi = (const float *) cxi;
+ block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+ float amax = 0.0f;
+ float vmax = 0.0f;
+
+ for (int j = 0; j < QK4_0; ++j) {
+ const float v = xi[j];
+ if (amax < fabsf(v)) {
+ amax = fabsf(v);
+ vmax = v;
+ }
+ }
+
+ const float d = vmax / -8;
+ const float id = d ? 1.0f/d : 0.0f;
+
+ dsti->d = d;
+
+ for (int j = 0; j < QK4_0/2; ++j) {
+ const float x0 = xi[0 + j]*id;
+ const float x1 = xi[QK4_0/2 + j]*id;
+
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+ dsti->qs[j] = xi0;
+ dsti->qs[j] |= xi1 << 4;
+ }
+ }
+
+ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+ const float * xi = (const float *) cxi;
+ block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+ float vmin = FLT_MAX;
+ float vmax = -FLT_MAX;
+
+ for (int j = 0; j < QK4_1; ++j) {
+ const float v = xi[j];
+
+ if (v < vmin) vmin = v;
+ if (v > vmax) vmax = v;
+ }
+
+ const float d = (vmax - vmin) / ((1 << 4) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ dsti->dm.x = d;
+ dsti->dm.y = vmin;
+
+ for (int j = 0; j < QK4_1/2; ++j) {
+ const float x0 = (xi[0 + j] - vmin)*id;
+ const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+ const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+ const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+ dsti->qs[j] = xi0;
+ dsti->qs[j] |= xi1 << 4;
+ }
+ }
+
+ template <cpy_kernel_t cpy_blck, int qk>
+ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+ const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+ if (i >= ne) {
+ return;
+ }
+
+ const int i02 = i / (ne00*ne01);
+ const int i01 = (i - i02*ne01*ne00) / ne00;
+ const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+ const int i12 = i / (ne10*ne11);
+ const int i11 = (i - i12*ne10*ne11) / ne10;
+ const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+ cpy_blck(cx + x_offset, cdst + dst_offset);
+ }
+
  static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
  const float y = (i0 / 2 - low) / max(0.001f, high - low);
  return 1.0f - min(1.0f, max(0.0f, y));
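Editor's note: the new cpy_blck_f32_q8_0 device function above quantizes one block of floats to q8_0 by scaling so the largest magnitude maps to +/-127 and rounding. A host-side reference of that scheme is sketched below; it is not part of the diff, and the block size constant (assumed 32, matching block_q8_0) and the function name are illustrative assumptions.

    // Host-side sketch of the q8_0 scale/round scheme used by cpy_blck_f32_q8_0.
    #include <cmath>
    #include <cstdint>

    constexpr int kQK8_0 = 32; // assumed block size of block_q8_0

    static void quantize_block_q8_0_ref(const float * x, float & d, int8_t * qs) {
        float amax = 0.0f;
        for (int j = 0; j < kQK8_0; ++j) {
            amax = fmaxf(amax, fabsf(x[j]));
        }
        d = amax / 127.0f;                  // scale: largest value maps to +/-127
        const float id = d ? 1.0f/d : 0.0f; // guard against an all-zero block
        for (int j = 0; j < kQK8_0; ++j) {
            qs[j] = (int8_t) roundf(x[j]*id);
        }
    }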
@@ -4610,8 +4774,8 @@ static __global__ void rope(

  template<typename T, bool has_pos>
  static __global__ void rope_neox(
- const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
- float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
  ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

@@ -4620,23 +4784,25 @@ static __global__ void rope_neox(
  }

  const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int i = row*ncols + col/2;
+ const int ib = col / n_dims;
+ const int ic = col % n_dims;
+
+ const int i = row*ncols + ib*n_dims + ic/2;
  const int i2 = row/p_delta_rows;

- // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
- const float cur_rot = -float(col)/ncols;
+ float cur_rot = inv_ndims * ic - ib;

  const int p = has_pos ? pos[i2] : 0;
- const float theta_base = p*powf(freq_base, cur_rot);
+ const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

  float cos_theta, sin_theta;
  rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

  const float x0 = x[i + 0];
- const float x1 = x[i + ncols/2];
+ const float x1 = x[i + n_dims/2];

- dst[i + 0] = x0*cos_theta - x1*sin_theta;
- dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
  }

  static __global__ void rope_glm_f32(
@@ -4702,6 +4868,65 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
  dst[i] = col * m_k + x[i];
  }

+ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+ const int row = blockIdx.y;
+ const int col = threadIdx.x;
+
+ float sum = 0.0f;
+ for (int i = col; i < ncols; i += blockDim.x) {
+ sum += x[row * ncols + i];
+ }
+
+ sum = warp_reduce_sum(sum);
+
+ if (col == 0) {
+ dst[row] = sum;
+ }
+ }
+
+ template<typename T>
+ static inline __device__ void swap(T & a, T & b) {
+ T tmp = a;
+ a = b;
+ b = tmp;
+ }
+
+ template<ggml_sort_order order>
+ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
+ // bitonic sort
+ int col = threadIdx.x;
+ int row = blockIdx.y;
+
+ if (col >= ncols) return;
+
+ const float * x_row = x + row * ncols;
+ int * dst_row = dst + row * ncols;
+
+ // initialize indices
+ if (col < ncols) {
+ dst_row[col] = col;
+ }
+ __syncthreads();
+
+ for (int k = 2; k <= ncols; k *= 2) {
+ for (int j = k / 2; j > 0; j /= 2) {
+ int ixj = col ^ j;
+ if (ixj > col) {
+ if ((col & k) == 0) {
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+ swap(dst_row[col], dst_row[ixj]);
+ }
+ } else {
+ if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+ swap(dst_row[col], dst_row[ixj]);
+ }
+ }
+ }
+ __syncthreads();
+ }
+ }
+ }
+
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
  const int col = blockDim.y*blockIdx.y + threadIdx.y;
  const int row = blockDim.x*blockIdx.x + threadIdx.x;
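Editor's note: k_argsort_f32_i32 above is an in-block bitonic sort over an index array, which is why the launcher added later in this diff asserts that ncols is a power of two. For clarity, a host-side reference of the intended per-row result (not the kernel itself) might look like the sketch below; the function name is an illustrative assumption.

    // Reference semantics only: per row, produce the permutation of column
    // indices that orders the row's values ascending or descending.
    #include <algorithm>
    #include <numeric>
    #include <vector>

    static std::vector<int> argsort_row_ref(const float * row, int ncols, bool ascending) {
        std::vector<int> idx(ncols);
        std::iota(idx.begin(), idx.end(), 0);
        std::sort(idx.begin(), idx.end(), [&](int a, int b) {
            return ascending ? row[a] < row[b] : row[a] > row[b];
        });
        return idx;
    }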
@@ -4711,49 +4936,79 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  }

  const int i = row*ncols + col;
- // dst[i] = col > n_past + row ? -INFINITY : x[i];
- dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+ //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
  }

- // the CUDA soft max implementation differs from the CPU implementation
- // instead of doubles floats are used
- static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int block_size = blockDim.y;
- const int tid = threadIdx.y;
+ static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+ const int tid = threadIdx.x;
+ const int rowx = blockIdx.x;
+ const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+ const int block_size = blockDim.x;
+
+ const int warp_id = threadIdx.x / WARP_SIZE;
+ const int lane_id = threadIdx.x % WARP_SIZE;
+
+ __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];

  float max_val = -INFINITY;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- max_val = max(max_val, x[i]);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
  }

  // find the max value in the block
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ max_val = warp_reduce_max(max_val);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = -INFINITY;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = max_val;
+ }
+ __syncthreads();
+
+ max_val = buf[lane_id];
+ max_val = warp_reduce_max(max_val);
  }

  float tmp = 0.f;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- const float val = expf(x[i] - max_val);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
  tmp += val;
- dst[i] = val;
+ dst[ix] = val;
  }

- // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ // find the sum of exps in the block
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = 0.f;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = tmp;
+ }
+ __syncthreads();
+
+ tmp = buf[lane_id];
+ tmp = warp_reduce_sum(tmp);
  }

  const float inv_tmp = 1.f / tmp;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
+ const int i = rowx*ncols + col;
  dst[i] *= inv_tmp;
  }
  }
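Editor's note: the reworked soft_max_f32 above fuses a scale factor and an optional additive mask into the usual max-subtract / exp / normalize pipeline. A host-side reference of the per-row formula it computes (softmax(x*scale + mask)) is sketched below; this is not part of the diff and the helper name is an illustrative assumption.

    // Host-side reference for one row: softmax(x*scale + mask), mask optional.
    #include <cmath>
    #include <vector>

    static std::vector<float> soft_max_row_ref(const float * x, const float * mask,
                                               int ncols, float scale) {
        std::vector<float> out(ncols);
        float max_val = -INFINITY;
        for (int i = 0; i < ncols; ++i) {
            out[i] = x[i]*scale + (mask ? mask[i] : 0.0f);
            max_val = fmaxf(max_val, out[i]);
        }
        float sum = 0.0f;
        for (int i = 0; i < ncols; ++i) {
            out[i] = expf(out[i] - max_val); // subtract the max for numerical stability
            sum += out[i];
        }
        for (int i = 0; i < ncols; ++i) {
            out[i] /= sum;
        }
        return out;
    }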
@@ -4805,25 +5060,119 @@ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const
  k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
  }

- static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
- const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
- }
-
- static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
- }
-
- static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
- }
+ template<float (*bin_op)(const float, const float)>
+ struct bin_bcast_cuda {
+ template<typename src0_t, typename src1_t, typename dst_t>
+ void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+ const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+ cudaStream_t stream) {
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+
+ int nr0 = ne10/ne0;
+ int nr1 = ne11/ne1;
+ int nr2 = ne12/ne2;
+ int nr3 = ne13/ne3;
+
+ int nr[4] = { nr0, nr1, nr2, nr3 };
+
+ // collapse dimensions until first broadcast dimension
+ int64_t cne0[] = {ne0, ne1, ne2, ne3};
+ int64_t cne1[] = {ne10, ne11, ne12, ne13};
+ size_t cnb0[] = {nb0, nb1, nb2, nb3};
+ size_t cnb1[] = {nb10, nb11, nb12, nb13};
+ auto collapse = [](int64_t cne[]) {
+ cne[0] *= cne[1];
+ cne[1] = cne[2];
+ cne[2] = cne[3];
+ cne[3] = 1;
+ };
+
+ auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+ cnb[1] *= cne[1];
+ cnb[2] *= cne[2];
+ cnb[3] *= cne[3];
+ };
+
+ for (int i = 0; i < 4; i++) {
+ if (nr[i] != 1) {
+ break;
+ }
+ if (i > 0) {
+ collapse_nb(cnb0, cne0);
+ collapse_nb(cnb1, cne1);
+ collapse(cne0);
+ collapse(cne1);
+ }
+ }
+ {
+ int64_t ne0 = cne0[0];
+ int64_t ne1 = cne0[1];
+ int64_t ne2 = cne0[2];
+ int64_t ne3 = cne0[3];
+
+ int64_t ne10 = cne1[0];
+ int64_t ne11 = cne1[1];
+ int64_t ne12 = cne1[2];
+ int64_t ne13 = cne1[3];
+
+ //size_t nb0 = cnb0[0];
+ size_t nb1 = cnb0[1];
+ size_t nb2 = cnb0[2];
+ size_t nb3 = cnb0[3];
+
+ //size_t nb10 = cnb1[0];
+ size_t nb11 = cnb1[1];
+ size_t nb12 = cnb1[2];
+ size_t nb13 = cnb1[3];
+
+ //size_t s0 = nb0 / sizeof(src1_t);
+ size_t s1 = nb1 / sizeof(src1_t);
+ size_t s2 = nb2 / sizeof(src1_t);
+ size_t s3 = nb3 / sizeof(src1_t);
+
+ //size_t s10 = nb10 / sizeof(src1_t);
+ size_t s11 = nb11 / sizeof(src1_t);
+ size_t s12 = nb12 / sizeof(src1_t);
+ size_t s13 = nb13 / sizeof(src1_t);
+
+
+ const int block_size = 128;
+
+ int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+ dim3 block_dims;
+ block_dims.x = std::min<unsigned int>(hne0, block_size);
+ block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+ block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+ dim3 block_nums(
+ (hne0 + block_dims.x - 1) / block_dims.x,
+ (ne1 + block_dims.y - 1) / block_dims.y,
+ (ne2*ne3 + block_dims.z - 1) / block_dims.z
+ );

- static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
- const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
- mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
- }
+ if (block_nums.z > 65535) {
+ // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+ int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+ k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+ src0_dd, src1_dd, dst_dd,
+ ne0, ne1, ne2, ne3,
+ ne10, ne11, ne12, ne13,
+ /* s0, */ s1, s2, s3,
+ /* s10, */ s11, s12, s13);
+ } else {
+ k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+ src0_dd, src1_dd, dst_dd,
+ ne0, ne1, ne2, ne3,
+ ne10, ne11, ne12, ne13,
+ /* s0, */ s1, s2, s3,
+ /* s10, */ s11, s12, s13);
+ }
+ }
+ }
+ };

  static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
@@ -4845,14 +5194,14 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
  sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
  }

- static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
  if (ncols < 1024) {
  const dim3 block_dims(WARP_SIZE, 1, 1);
- norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
  } else {
  const dim3 block_dims(1024, 1, 1);
- norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
  }
  }

@@ -4874,34 +5223,10 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
  quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
  }

- template<typename dst_t>
- static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- template<typename dst_t>
- static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+ dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

  template<typename dst_t>
@@ -4950,6 +5275,64 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
  #endif
  }

+ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0:
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ case GGML_TYPE_Q4_1:
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ case GGML_TYPE_Q5_0:
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+ case GGML_TYPE_Q5_1:
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+ case GGML_TYPE_Q8_0:
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+ case GGML_TYPE_Q2_K:
+ return dequantize_row_q2_K_cuda;
+ case GGML_TYPE_Q3_K:
+ return dequantize_row_q3_K_cuda;
+ case GGML_TYPE_Q4_K:
+ return dequantize_row_q4_K_cuda;
+ case GGML_TYPE_Q5_K:
+ return dequantize_row_q5_K_cuda;
+ case GGML_TYPE_Q6_K:
+ return dequantize_row_q6_K_cuda;
+ case GGML_TYPE_F32:
+ return dequantize_block_cuda<1, 1, convert_f32>;
+ default:
+ return nullptr;
+ }
+ }
+
+ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0:
+ return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ case GGML_TYPE_Q4_1:
+ return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ case GGML_TYPE_Q5_0:
+ return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+ case GGML_TYPE_Q5_1:
+ return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+ case GGML_TYPE_Q8_0:
+ return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+ case GGML_TYPE_Q2_K:
+ return dequantize_row_q2_K_cuda;
+ case GGML_TYPE_Q3_K:
+ return dequantize_row_q3_K_cuda;
+ case GGML_TYPE_Q4_K:
+ return dequantize_row_q4_K_cuda;
+ case GGML_TYPE_Q5_K:
+ return dequantize_row_q5_K_cuda;
+ case GGML_TYPE_Q6_K:
+ return dequantize_row_q6_K_cuda;
+ case GGML_TYPE_F16:
+ return dequantize_block_cuda<1, 1, convert_f16>;
+ default:
+ return nullptr;
+ }
+ }
+
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -5038,13 +5421,22 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

- static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % QK4_0 == 0);
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(block_num_y, 1, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ dequantize_mul_mat_vec<1, 1, convert_f16>
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK4_0 == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(block_num_y, 1, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -5128,83 +5520,6 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

- static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
- dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
- }
-
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(block_num_y, 1, 1);
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- dequantize_mul_mat_vec<1, 1, convert_f16>
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
- }
-
- static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_row_q4_0_cuda;
- case GGML_TYPE_Q4_1:
- return dequantize_row_q4_1_cuda;
- case GGML_TYPE_Q5_0:
- return dequantize_row_q5_0_cuda;
- case GGML_TYPE_Q5_1:
- return dequantize_row_q5_1_cuda;
- case GGML_TYPE_Q8_0:
- return dequantize_row_q8_0_cuda;
- case GGML_TYPE_Q2_K:
- return dequantize_row_q2_K_cuda;
- case GGML_TYPE_Q3_K:
- return dequantize_row_q3_K_cuda;
- case GGML_TYPE_Q4_K:
- return dequantize_row_q4_K_cuda;
- case GGML_TYPE_Q5_K:
- return dequantize_row_q5_K_cuda;
- case GGML_TYPE_Q6_K:
- return dequantize_row_q6_K_cuda;
- case GGML_TYPE_F32:
- return convert_fp32_to_fp16_cuda;
- default:
- return nullptr;
- }
- }
-
- static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_row_q4_0_cuda;
- case GGML_TYPE_Q4_1:
- return dequantize_row_q4_1_cuda;
- case GGML_TYPE_Q5_0:
- return dequantize_row_q5_0_cuda;
- case GGML_TYPE_Q5_1:
- return dequantize_row_q5_1_cuda;
- case GGML_TYPE_Q8_0:
- return dequantize_row_q8_0_cuda;
- case GGML_TYPE_Q2_K:
- return dequantize_row_q2_K_cuda;
- case GGML_TYPE_Q3_K:
- return dequantize_row_q3_K_cuda;
- case GGML_TYPE_Q4_K:
- return dequantize_row_q4_K_cuda;
- case GGML_TYPE_Q5_K:
- return dequantize_row_q5_K_cuda;
- case GGML_TYPE_Q6_K:
- return dequantize_row_q6_K_cuda;
- case GGML_TYPE_F16:
- return convert_fp16_to_fp32_cuda;
- default:
- return nullptr;
- }
- }
-
  static void ggml_mul_mat_q4_0_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -5697,6 +6012,39 @@ static void ggml_cpy_f32_f16_cuda(
  (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
  }

+ static void ggml_cpy_f32_q8_0_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ GGML_ASSERT(ne % QK8_0 == 0);
+ const int num_blocks = ne / QK8_0;
+ cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
+ static void ggml_cpy_f32_q4_0_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ GGML_ASSERT(ne % QK4_0 == 0);
+ const int num_blocks = ne / QK4_0;
+ cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
+ static void ggml_cpy_f32_q4_1_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ GGML_ASSERT(ne % QK4_1 == 0);
+ const int num_blocks = ne / QK4_1;
+ cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
  static void ggml_cpy_f16_f16_cuda(
  const char * cx, char * cdst, const int ne,
  const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -5739,20 +6087,26 @@ static void rope_cuda(

  template<typename T>
  static void rope_neox_cuda(
- const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
  float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
  ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ const float inv_ndims = -1.0f / n_dims;
+
  if (pos == nullptr) {
  rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  } else {
  rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  }
  }
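Editor's note: the launcher above now precomputes theta_scale = freq_base^(-2/n_dims) and passes it to rope_neox. Purely for illustration (not part of the diff, helper name assumed), the per-pair rotation angle this encodes can be written as:

    #include <cmath>

    // pair = col/2 in the kernel; each successive pair rotates with a frequency
    // reduced by the factor freq_base^(2/n_dims).
    static float rope_neox_theta(int p, int pair, int n_dims, float freq_base, float freq_scale) {
        const float theta_scale = powf(freq_base, -2.0f/n_dims); // matches the launcher above
        return p * freq_scale * powf(theta_scale, (float) pair);
    }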
@@ -5777,6 +6131,27 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
5777
6131
  alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
5778
6132
  }
5779
6133
 
6134
+ static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6135
+ const dim3 block_dims(WARP_SIZE, 1, 1);
6136
+ const dim3 block_nums(1, nrows, 1);
6137
+ k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
6138
+ }
6139
+
6140
+ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
6141
+ // bitonic sort requires ncols to be power of 2
6142
+ GGML_ASSERT((ncols & (ncols - 1)) == 0);
6143
+
6144
+ const dim3 block_dims(ncols, 1, 1);
6145
+ const dim3 block_nums(1, nrows, 1);
6146
+ if (order == GGML_SORT_ASC) {
6147
+ k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
6148
+ } else if (order == GGML_SORT_DESC) {
6149
+ k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
6150
+ } else {
6151
+ GGML_ASSERT(false);
6152
+ }
6153
+ }
6154
+
5780
6155
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
5781
6156
  const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
5782
6157
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -5784,10 +6159,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
5784
6159
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
5785
6160
  }
5786
6161
 
5787
- static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
5788
- const dim3 block_dims(1, WARP_SIZE, 1);
6162
+ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
6163
+ int nth = WARP_SIZE;
6164
+ while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
6165
+ const dim3 block_dims(nth, 1, 1);
5789
6166
  const dim3 block_nums(nrows_x, 1, 1);
5790
- soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
6167
+ soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
5791
6168
  }
5792
6169
 
5793
6170
  static void im2col_f32_f16_cuda(const float * x, half * dst,
@@ -5867,7 +6244,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
5867
6244
  return ptr;
5868
6245
  }
5869
6246
  #ifdef DEBUG_CUDA_MALLOC
5870
- fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
6247
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
5871
6248
  (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
5872
6249
  #endif
5873
6250
  void * ptr;
@@ -6005,7 +6382,7 @@ void * ggml_cuda_host_malloc(size_t size) {
6005
6382
  // The allocation error can be bypassed. A null ptr will assigned out of this function.
6006
6383
  // This can fixed the OOM error in WSL.
6007
6384
  cudaGetLastError();
6008
- fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
6385
+ fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
6009
6386
  size/1024.0/1024.0, cudaGetErrorString(err));
6010
6387
  return nullptr;
6011
6388
  }
@@ -6050,75 +6427,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
6050
6427
  const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
6051
6428
  if (nb0 == ts && nb1 == ts*ne0/bs) {
6052
6429
  return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
6053
- }
6054
- if (nb0 == ts) {
6430
+ } else if (nb0 == ts) {
6055
6431
  return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
6056
- }
6057
- for (int64_t i1 = 0; i1 < i1_diff; i1++) {
6058
- const void * rx = (const void *) ((const char *) x + i1*nb1);
6059
- void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
6060
- // pretend the row is a matrix with cols=1
6061
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
6062
- if (r != cudaSuccess) { return r; }
6063
- }
6064
- return cudaSuccess;
6065
- }
6066
-
6067
- static void ggml_cuda_op_repeat(
6068
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6069
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
6070
- // guaranteed to be an integer due to the check in ggml_can_repeat
6071
- const int64_t ne0 = dst->ne[0];
6072
- const int64_t ne1 = dst->ne[1];
6073
- const int64_t ne2 = dst->ne[2];
6074
- const int64_t ne3 = dst->ne[3];
6075
-
6076
- const int64_t ne00 = src0->ne[0];
6077
- const int64_t ne01 = src0->ne[1];
6078
- const int64_t ne02 = src0->ne[2];
6079
- const int64_t ne03 = src0->ne[3];
6080
-
6081
- const size_t nb0 = dst->nb[0];
6082
- const size_t nb1 = dst->nb[1];
6083
- const size_t nb2 = dst->nb[2];
6084
- const size_t nb3 = dst->nb[3];
6085
-
6086
- const size_t nb00 = src0->nb[0];
6087
- const size_t nb01 = src0->nb[1];
6088
- const size_t nb02 = src0->nb[2];
6089
- const size_t nb03 = src0->nb[3];
6090
-
6091
- const int nr0 = (int)(ne0/ne00);
6092
- const int nr1 = (int)(ne1/ne01);
6093
- const int nr2 = (int)(ne2/ne02);
6094
- const int nr3 = (int)(ne3/ne03);
6095
-
6096
- // TODO: support for transposed / permuted tensors
6097
- GGML_ASSERT(nb0 == sizeof(float));
6098
- GGML_ASSERT(nb00 == sizeof(float));
6099
-
6100
- // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
6101
- for (int i3 = 0; i3 < nr3; i3++) {
6102
- for (int k3 = 0; k3 < ne03; k3++) {
6103
- for (int i2 = 0; i2 < nr2; i2++) {
6104
- for (int k2 = 0; k2 < ne02; k2++) {
6105
- for (int i1 = 0; i1 < nr1; i1++) {
6106
- for (int k1 = 0; k1 < ne01; k1++) {
6107
- for (int i0 = 0; i0 < nr0; i0++) {
6108
- CUDA_CHECK(cudaMemcpyAsync(
6109
- (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
6110
- (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
6111
- ne00*nb0, cudaMemcpyDeviceToDevice, stream));
6112
- }
6113
- }
6114
- }
6115
- }
6116
- }
6432
+ } else {
6433
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
6434
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
6435
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
6436
+ // pretend the row is a matrix with cols=1
6437
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
6438
+ if (r != cudaSuccess) return r;
6117
6439
  }
6440
+ return cudaSuccess;
6118
6441
  }
6119
-
6120
- (void) src1;
6121
- (void) src1_d;
6122
6442
  }
6123
6443
 
6124
6444
  static void ggml_cuda_op_get_rows(
@@ -6165,44 +6485,55 @@ static void ggml_cuda_op_get_rows(
6165
6485
  }
6166
6486
  }
6167
6487
 
6168
- inline void ggml_cuda_op_add(
6488
+ template<class op>
6489
+ inline void ggml_cuda_op_bin_bcast(
6169
6490
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6170
6491
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6171
6492
 
6172
6493
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6173
6494
 
6174
- const int64_t ne10 = src1->ne[0];
6175
- const int64_t ne11 = src1->ne[1];
6176
-
6177
6495
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
6178
- add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
6496
+ op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6179
6497
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
6180
- add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
6498
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
6181
6499
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
6182
- add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
6500
+ op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
6183
6501
  } else {
6184
- fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
6502
+ fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
6503
+ ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
6185
6504
  GGML_ASSERT(false);
6186
6505
  }
6506
+ }
6507
+
6508
+ static void ggml_cuda_op_repeat(
6509
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6510
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
6511
+
6512
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
6187
6513
 
6188
6514
  (void) src1;
6189
- (void) dst;
6515
+ (void) src1_d;
6190
6516
  }
6191
6517
 
6192
- inline void ggml_cuda_op_mul(
6518
+ inline void ggml_cuda_op_add(
6193
6519
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6194
6520
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6195
6521
 
6196
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
6197
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
6198
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
6522
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6523
+ }
6199
6524
 
6200
- const int64_t ne10 = src1->ne[0];
6201
- const int64_t ne11 = src1->ne[1];
6525
+ inline void ggml_cuda_op_mul(
6526
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6527
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6202
6528
 
6203
- mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
6529
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6530
+ }
6204
6531
 
6205
- (void) dst;
6532
+ inline void ggml_cuda_op_div(
6533
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6534
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6535
+
6536
+ ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6206
6537
  }
6207
6538
 
6208
6539
  inline void ggml_cuda_op_gelu(
@@ -6271,7 +6602,10 @@ inline void ggml_cuda_op_norm(
6271
6602
  const int64_t ne00 = src0->ne[0];
6272
6603
  const int64_t nrows = ggml_nrows(src0);
6273
6604
 
6274
- norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
6605
+ float eps;
6606
+ memcpy(&eps, dst->op_params, sizeof(float));
6607
+
6608
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
6275
6609
 
6276
6610
  (void) src1;
6277
6611
  (void) dst;
@@ -6426,6 +6760,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
6426
6760
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6427
6761
  const int64_t src1_padded_row_size, const cudaStream_t & stream) {
6428
6762
 
6763
+ GGML_ASSERT(ggml_nrows(src1) == 1);
6764
+
6429
6765
  const int64_t ne00 = src0->ne[0];
6430
6766
  const int64_t row_diff = row_high - row_low;
6431
6767
 
@@ -6485,7 +6821,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
6485
6821
  size_t ash;
6486
6822
  dfloat * src1_dfloat = nullptr; // dfloat == half
6487
6823
 
6488
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
6824
+ bool src1_convert_f16 =
6825
+ src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
6489
6826
  src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
6490
6827
  src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
6491
6828
 
@@ -6707,15 +7044,14 @@ inline void ggml_cuda_op_rope(
6707
7044
  GGML_ASSERT(false);
6708
7045
  rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
6709
7046
  } else if (is_neox) {
6710
- GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
6711
7047
  if (src0->type == GGML_TYPE_F32) {
6712
7048
  rope_neox_cuda(
6713
- (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
7049
+ (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6714
7050
  attn_factor, corr_dims, main_stream
6715
7051
  );
6716
7052
  } else if (src0->type == GGML_TYPE_F16) {
6717
7053
  rope_neox_cuda(
6718
- (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
7054
+ (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6719
7055
  attn_factor, corr_dims, main_stream
6720
7056
  );
6721
7057
  } else {
@@ -6812,6 +7148,42 @@ inline void ggml_cuda_op_im2col(
6812
7148
  (void) src0_dd;
6813
7149
  }
6814
7150
 
7151
+ inline void ggml_cuda_op_sum_rows(
7152
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7153
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7154
+
7155
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7156
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7157
+
7158
+ const int64_t ncols = src0->ne[0];
7159
+ const int64_t nrows = ggml_nrows(src0);
7160
+
7161
+ sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
7162
+
7163
+ (void) src1;
7164
+ (void) dst;
7165
+ (void) src1_dd;
7166
+ }
7167
+
7168
+ inline void ggml_cuda_op_argsort(
7169
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7170
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7171
+
7172
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7173
+ GGML_ASSERT( dst->type == GGML_TYPE_I32);
7174
+
7175
+ const int64_t ncols = src0->ne[0];
7176
+ const int64_t nrows = ggml_nrows(src0);
7177
+
7178
+ enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
7179
+
7180
+ argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
7181
+
7182
+ (void) src1;
7183
+ (void) dst;
7184
+ (void) src1_dd;
7185
+ }
7186
+
6815
7187
  inline void ggml_cuda_op_diag_mask_inf(
6816
7188
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6817
7189
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6839,14 +7211,18 @@ inline void ggml_cuda_op_soft_max(
6839
7211
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6840
7212
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6841
7213
 
7214
+ GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
7215
+
6842
7216
  const int64_t ne00 = src0->ne[0];
6843
- const int64_t nrows = ggml_nrows(src0);
7217
+ const int64_t nrows_x = ggml_nrows(src0);
7218
+ const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
6844
7219
 
6845
- soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
7220
+ float scale = 1.0f;
7221
+ memcpy(&scale, dst->op_params, sizeof(float));
7222
+
7223
+ soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
6846
7224
 
6847
- (void) src1;
6848
7225
  (void) dst;
6849
- (void) src1_dd;
6850
7226
  }
6851
7227
 
6852
7228
  inline void ggml_cuda_op_scale(
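The reworked `ggml_cuda_op_soft_max` above now accepts an optional F32 mask in `src1` and reads a `scale` out of `dst->op_params`, i.e. each output row is softmax(scale * x + mask), with the mask rows broadcast over the input rows. A plain CPU reference of that computation, under the assumption that input row r maps to mask row r % nrows_y; names here are illustrative:

    #include <math.h>
    #include <stddef.h>

    // reference: dst[r, :] = softmax(scale * x[r, :] + mask[r % nrows_y, :])
    static void soft_max_ref(const float * x, const float * mask, float * dst,
                             int ncols, int nrows_x, int nrows_y, float scale) {
        for (int r = 0; r < nrows_x; r++) {
            const float * xr = x   + (size_t) r*ncols;
            const float * mr = mask ? mask + (size_t) (r % nrows_y)*ncols : NULL;
            float       * dr = dst + (size_t) r*ncols;

            float max_val = -INFINITY;
            for (int c = 0; c < ncols; c++) {
                dr[c] = scale*xr[c] + (mr ? mr[c] : 0.0f);
                max_val = fmaxf(max_val, dr[c]);
            }
            float sum = 0.0f;
            for (int c = 0; c < ncols; c++) {
                dr[c] = expf(dr[c] - max_val); // subtract max for numerical stability
                sum  += dr[c];
            }
            for (int c = 0; c < ncols; c++) {
                dr[c] /= sum;
            }
        }
    }
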
@@ -7016,7 +7392,7 @@ static void ggml_cuda_op_mul_mat(
7016
7392
  const int64_t ne01 = src0->ne[1];
7017
7393
  const int64_t ne02 = src0->ne[2];
7018
7394
  const int64_t ne03 = src0->ne[3];
7019
- // const int64_t nrows0 = ggml_nrows(src0);
7395
+ const int64_t nrows0 = ggml_nrows(src0);
7020
7396
 
7021
7397
  const int64_t ne10 = src1->ne[0];
7022
7398
  const int64_t ne11 = src1->ne[1];
@@ -7052,10 +7428,9 @@ static void ggml_cuda_op_mul_mat(
7052
7428
 
7053
7429
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
7054
7430
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
7055
-
7056
7431
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
7057
- const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
7058
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
7432
+
7433
+ const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
7059
7434
 
7060
7435
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
7061
7436
  GGML_ASSERT(!(split && ne02 > 1));
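The padded column size above is now computed with `GGML_PAD(ne10, MATRIX_ROW_PADDING)` instead of the hand-written modulo arithmetic it replaces. Assuming `GGML_PAD(x, n)` expands to the usual round-up-to-multiple idiom `(((x) + (n) - 1) / (n) * (n))`, the two forms agree; a small standalone check (macro name chosen here to avoid clashing with the real one):

    #define PAD_UP(x, n) (((x) + (n) - 1) / (n) * (n))

    static_assert(PAD_UP(512, 512) == 512,  "already aligned stays put");
    static_assert(PAD_UP(513, 512) == 1024, "one past the boundary rounds up");
    static_assert(PAD_UP(  1, 512) == 512,  "small sizes round up to one block");
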
@@ -7180,7 +7555,7 @@ static void ggml_cuda_op_mul_mat(
7180
7555
  const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
7181
7556
 
7182
7557
  // for split tensors the data begins at i0 == i0_offset_low
7183
- char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
7558
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
7184
7559
  float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
7185
7560
  char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
7186
7561
  float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@@ -7325,6 +7700,10 @@ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, gg
7325
7700
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
7326
7701
  }
7327
7702
 
7703
+ static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7704
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
7705
+ }
7706
+
7328
7707
  static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7329
7708
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
7330
7709
  }
@@ -7350,7 +7729,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
7350
7729
  }
7351
7730
 
7352
7731
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
7353
- if (!g_cublas_loaded) { return false; }
7732
+ if (!g_cublas_loaded) return false;
7354
7733
 
7355
7734
  const int64_t ne10 = src1->ne[0];
7356
7735
 
@@ -7428,7 +7807,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7428
7807
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7429
7808
  }
7430
7809
 
7431
- __global__ static void k_compute_batched_ptrs(
7810
+ static __global__ void k_compute_batched_ptrs(
7432
7811
  const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7433
7812
  const void ** ptrs_src, void ** ptrs_dst,
7434
7813
  int ne12, int ne13,
@@ -7484,9 +7863,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7484
7863
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7485
7864
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7486
7865
 
7487
- int id;
7488
- CUDA_CHECK(cudaGetDevice(&id));
7489
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
7866
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
7490
7867
 
7491
7868
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7492
7869
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -7543,7 +7920,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7543
7920
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
7544
7921
  // use cublasGemmStridedBatchedEx
7545
7922
  CUBLAS_CHECK(
7546
- cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7923
+ cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7547
7924
  ne01, ne11, ne10,
7548
7925
  &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
7549
7926
  (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
@@ -7577,7 +7954,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7577
7954
  CUDA_CHECK(cudaGetLastError());
7578
7955
 
7579
7956
  CUBLAS_CHECK(
7580
- cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7957
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
7581
7958
  ne01, ne11, ne10,
7582
7959
  &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7583
7960
  (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
@@ -7647,10 +8024,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7647
8024
  #ifdef GGML_CUDA_FORCE_DMMV
7648
8025
  const bool use_mul_mat_vec_q = false;
7649
8026
  #else
7650
- const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
8027
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
7651
8028
  #endif // GGML_CUDA_FORCE_DMMV
7652
8029
 
7653
8030
  if (use_mul_mat_vec_q) {
8031
+ // NOTE: this kernel does not support ggml_nrows(src1) > 1
7654
8032
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
7655
8033
  } else {
7656
8034
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
@@ -7675,42 +8053,255 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7675
8053
  }
7676
8054
  }
7677
8055
 
7678
- static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7679
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
7680
- }
8056
+ #if 0
8057
+ template<typename ... Srcs>
8058
+ static __global__ void k_compute_batched_ptrs_id(
8059
+ const void ** ptrs_src, void ** ptrs_dst,
8060
+ int ne12, int ne13,
8061
+ int ne23,
8062
+ int nb02, int nb03,
8063
+ int nb12, int nb13,
8064
+ int nb2, int nb3,
8065
+ int r2, int r3,
8066
+ ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
8067
+ const half * src1_f16, half * dst_f16,
8068
+ const int32_t * ids, const int id,
8069
+ Srcs... src0s) {
8070
+
8071
+ int i = ids[id];
8072
+
8073
+ half * src0_f16;
8074
+ const void * srcs_ar[] = { (const half *) src0s... };
8075
+ if (src0_type == GGML_TYPE_F16) {
8076
+ src0_f16 = (half *) srcs_ar[i];
8077
+ } else {
8078
+ src0_f16 = src0_as_f16;
8079
+ if (threadIdx.x == 0 && threadIdx.y == 0) {
8080
+ const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
8081
+ to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
8082
+ }
8083
+ }
7681
8084
 
7682
- static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7683
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
8085
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
8086
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
8087
+
8088
+ if (i13 >= ne13 || i12 >= ne12) {
8089
+ return;
8090
+ }
8091
+
8092
+ int i03 = i13 / r3;
8093
+ int i02 = i12 / r2;
8094
+
8095
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
8096
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
8097
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
7684
8098
  }
7685
8099
 
7686
- static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7687
- const int64_t ne = ggml_nelements(src0);
7688
- GGML_ASSERT(ne == ggml_nelements(src1));
8100
+ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
8101
+ const struct ggml_tensor * ids = dst->src[0];
8102
+ const struct ggml_tensor * src1 = dst->src[1];
8103
+ const struct ggml_tensor * src00 = dst->src[2];
7689
8104
 
7690
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
7691
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
8105
+ const int id = dst->op_params[0];
7692
8106
 
7693
- GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
7694
- GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
8107
+ GGML_ASSERT(!ggml_is_transposed(src00));
8108
+ GGML_ASSERT(!ggml_is_transposed(src1));
7695
8109
 
7696
- const int64_t ne00 = src0->ne[0];
7697
- const int64_t ne01 = src0->ne[1];
7698
- GGML_ASSERT(src0->ne[3] == 1);
8110
+ GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
8111
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
7699
8112
 
7700
- const int64_t nb00 = src0->nb[0];
7701
- const int64_t nb01 = src0->nb[1];
7702
- const int64_t nb02 = src0->nb[2];
8113
+ const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
8114
+ const int64_t ne01 = src00->ne[1];
8115
+ const int64_t ne02 = src00->ne[2];
8116
+ const int64_t ne03 = src00->ne[3];
8117
+
8118
+ //const int64_t nb01 = src00->nb[1];
8119
+ const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
8120
+ const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
7703
8121
 
7704
8122
  const int64_t ne10 = src1->ne[0];
7705
8123
  const int64_t ne11 = src1->ne[1];
7706
- GGML_ASSERT(src1->ne[3] == 1);
8124
+ const int64_t ne12 = src1->ne[2];
8125
+ const int64_t ne13 = src1->ne[3];
7707
8126
 
7708
- const int64_t nb10 = src1->nb[0];
7709
- const int64_t nb11 = src1->nb[1];
7710
- const int64_t nb12 = src1->nb[2];
8127
+ //const int64_t nb11 = src1->nb[1];
8128
+ const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
8129
+ const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
7711
8130
 
7712
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7713
- cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8131
+ const int64_t ne1 = ggml_nelements(src1);
8132
+ const int64_t ne = ggml_nelements(dst);
8133
+
8134
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8135
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8136
+
8137
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
8138
+
8139
+ //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
8140
+ //void * src0_ddq = src0_extra->data_device[g_main_device];
8141
+ //half * src0_as_f16 = (half *) src0_ddq;
8142
+
8143
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
8144
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
8145
+
8146
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
8147
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
8148
+
8149
+ // convert src1 to fp16
8150
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
8151
+ GGML_ASSERT(to_fp16_cuda != nullptr);
8152
+
8153
+ size_t src1_as = 0;
8154
+ half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
8155
+ to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
8156
+
8157
+ size_t dst_as = 0;
8158
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8159
+
8160
+ GGML_ASSERT(ne12 % ne02 == 0);
8161
+ GGML_ASSERT(ne13 % ne03 == 0);
8162
+
8163
+ // broadcast factors
8164
+ const int64_t r2 = ne12/ne02;
8165
+ const int64_t r3 = ne13/ne03;
8166
+
8167
+ const half alpha_f16 = 1.0f;
8168
+ const half beta_f16 = 0.0f;
8169
+
8170
+ // use cublasGemmBatchedEx
8171
+ const int ne23 = ne12*ne13;
8172
+
8173
+ const void ** ptrs_src = nullptr;
8174
+ void ** ptrs_dst = nullptr;
8175
+
8176
+ size_t ptrs_src_s = 0;
8177
+ size_t ptrs_dst_s = 0;
8178
+
8179
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
8180
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
8181
+
8182
+ int64_t src0_ne = ggml_nelements(src00);
8183
+ half * src0_as_f16 = nullptr;
8184
+ size_t src0_as = 0;
8185
+ if (src00->type != GGML_TYPE_F16) {
8186
+ src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
8187
+ }
8188
+
8189
+ static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
8190
+ dim3 block_dims(ne13, ne12);
8191
+ k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
8192
+ ptrs_src, ptrs_dst,
8193
+ ne12, ne13,
8194
+ ne23,
8195
+ ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
8196
+ nb12, nb13,
8197
+ dst->nb[2], dst->nb[3],
8198
+ r2, r3,
8199
+ src00->type, src0_as_f16, src0_ne,
8200
+ src1_as_f16, dst_f16,
8201
+ (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
8202
+ dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
8203
+ dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
8204
+ dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
8205
+ dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
8206
+ );
8207
+ CUDA_CHECK(cudaGetLastError());
8208
+
8209
+ CUBLAS_CHECK(
8210
+ cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8211
+ ne01, ne11, ne10,
8212
+ &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
8213
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
8214
+ &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
8215
+ ne23,
8216
+ CUBLAS_COMPUTE_16F,
8217
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8218
+
8219
+ if (src0_as != 0) {
8220
+ ggml_cuda_pool_free(src0_as_f16, src0_as);
8221
+ }
8222
+ if (ptrs_src_s != 0) {
8223
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
8224
+ }
8225
+ if (ptrs_dst_s != 0) {
8226
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
8227
+ }
8228
+
8229
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8230
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8231
+
8232
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
8233
+ ggml_cuda_pool_free(dst_f16, dst_as);
8234
+ }
8235
+ #endif
8236
+
8237
+ static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) {
8238
+ #if 0
8239
+ //#ifdef CUDA_USE_TENSOR_CORES
8240
+ // const bool use_tensor_cores = true;
8241
+ //#else
8242
+ // const bool use_tensor_cores = false;
8243
+ //#endif
8244
+
8245
+ ggml_cuda_mul_mat_id_cublas(dst);
8246
+
8247
+ // TODO: mmq/mmv support
8248
+ #else
8249
+ const struct ggml_tensor * ids = dst->src[0];
8250
+ const struct ggml_tensor * src1 = dst->src[1];
8251
+ const int id = dst->op_params[0];
8252
+
8253
+ int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8254
+
8255
+ int32_t a_id;
8256
+ CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8257
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8258
+
8259
+ GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
8260
+ const struct ggml_tensor * src0 = dst->src[a_id + 2];
8261
+
8262
+ ggml_cuda_mul_mat(src0, src1, dst);
8263
+ #endif
8264
+
8265
+ (void) _src0;
8266
+ (void) _src1;
8267
+ }
8268
+
8269
+ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8270
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
8271
+ }
8272
+
8273
+ static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8274
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
8275
+ }
8276
+
8277
+ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8278
+ const int64_t ne = ggml_nelements(src0);
8279
+ GGML_ASSERT(ne == ggml_nelements(src1));
8280
+
8281
+ GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
8282
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
8283
+
8284
+ GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
8285
+ GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
8286
+
8287
+ const int64_t ne00 = src0->ne[0];
8288
+ const int64_t ne01 = src0->ne[1];
8289
+ GGML_ASSERT(src0->ne[3] == 1);
8290
+
8291
+ const int64_t nb00 = src0->nb[0];
8292
+ const int64_t nb01 = src0->nb[1];
8293
+ const int64_t nb02 = src0->nb[2];
8294
+
8295
+ const int64_t ne10 = src1->ne[0];
8296
+ const int64_t ne11 = src1->ne[1];
8297
+ GGML_ASSERT(src1->ne[3] == 1);
8298
+
8299
+ const int64_t nb10 = src1->nb[0];
8300
+ const int64_t nb11 = src1->nb[1];
8301
+ const int64_t nb12 = src1->nb[2];
8302
+
8303
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8304
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7714
8305
 
7715
8306
  const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7716
8307
  const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
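The enabled branch of the new `ggml_cuda_mul_mat_id` above (the batched cuBLAS variant is kept under `#if 0`) selects the expert with a blocking read: it copies `ids[id]` from device memory, synchronizes the stream so the value is valid on the host, picks `src0 = dst->src[a_id + 2]`, and then falls through to the regular `ggml_cuda_mul_mat` path. The synchronize is required because `cudaMemcpyAsync` only enqueues the copy; a generic sketch of that pattern (names are illustrative, not the file's):

    #include <cuda_runtime.h>

    // read one int32 that lives on the device, blocking until it is valid
    static int32_t read_device_i32(const int32_t * dev_ptr, cudaStream_t stream) {
        int32_t value = 0;
        cudaMemcpyAsync(&value, dev_ptr, sizeof(value), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream); // without this, 'value' may not be written yet
        return value;
    }

The cost is one host/device round trip per graph node; the disabled cuBLAS path sketches a way to avoid it by computing the batched pointers on the device instead, with mmq/mmv support noted there as future work.
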
@@ -7719,14 +8310,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7719
8310
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
7720
8311
 
7721
8312
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
7722
- ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7723
- ne10, ne11, nb10, nb11, nb12, main_stream);
8313
+ ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7724
8314
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
7725
- ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7726
- ne10, ne11, nb10, nb11, nb12, main_stream);
8315
+ ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8316
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
8317
+ ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8318
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
8319
+ ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
8320
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
8321
+ ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7727
8322
  } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
7728
- ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
7729
- ne10, ne11, nb10, nb11, nb12, main_stream);
8323
+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
7730
8324
  } else {
7731
8325
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
7732
8326
  ggml_type_name(src0->type), ggml_type_name(src1->type));
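`ggml_cuda_cpy` above gains F32 -> Q8_0 / Q4_0 / Q4_1 paths, so a copy can now quantize on the fly on the GPU. For orientation, a CPU-side sketch of what quantizing one Q8_0 block involves, assuming the usual Q8_0 layout of 32 values per block with a single scale d = amax/127 (stored as fp16 in the real format); the struct and function names here are illustrative:

    #include <math.h>
    #include <stdint.h>

    #define QBLOCK 32

    typedef struct {
        float  d;            // per-block scale (fp16 in the real format)
        int8_t qs[QBLOCK];   // quantized values
    } block_q8_0_ref;

    static void quantize_block_q8_0_ref(const float * x, block_q8_0_ref * y) {
        float amax = 0.0f;
        for (int i = 0; i < QBLOCK; i++) {
            amax = fmaxf(amax, fabsf(x[i]));
        }
        const float d  = amax / 127.0f;
        const float id = d != 0.0f ? 1.0f/d : 0.0f;

        y->d = d;
        for (int i = 0; i < QBLOCK; i++) {
            y->qs[i] = (int8_t) roundf(x[i]*id);
        }
    }
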
@@ -7737,6 +8331,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7737
8331
  }
7738
8332
 
7739
8333
  static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8334
+ // TODO: why do we pass dst as src1 here?
7740
8335
  ggml_cuda_cpy(src0, dst, nullptr);
7741
8336
  (void) src1;
7742
8337
  }
@@ -7762,6 +8357,16 @@ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1,
7762
8357
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
7763
8358
  }
7764
8359
 
8360
+ static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8361
+ GGML_ASSERT(ggml_is_contiguous(src0));
8362
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
8363
+ }
8364
+
8365
+ static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8366
+ GGML_ASSERT(ggml_is_contiguous(src0));
8367
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
8368
+ }
8369
+
7765
8370
  static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7766
8371
  (void) src0;
7767
8372
  (void) src1;
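The `ggml_cuda_argsort` dispatcher above requires a contiguous F32 input and, per `ggml_cuda_op_argsort` earlier in this diff, produces I32 indices row by row with the direction taken from `dst->op_params[0]`. A CPU reference of the per-row result, assuming the order enum distinguishes ascending and descending:

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // reference: indices that would sort one row of ncols floats
    static std::vector<int32_t> argsort_row_ref(const float * row, int ncols, bool ascending) {
        std::vector<int32_t> idx(ncols);
        std::iota(idx.begin(), idx.end(), 0);
        std::sort(idx.begin(), idx.end(), [&](int32_t a, int32_t b) {
            return ascending ? row[a] < row[b] : row[a] > row[b];
        });
        return idx;
    }
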
@@ -8017,8 +8622,9 @@ void ggml_cuda_set_main_device(const int main_device) {
8017
8622
  main_device, g_device_count, g_main_device);
8018
8623
  return;
8019
8624
  }
8020
- g_main_device = main_device;
8021
- if (g_device_count > 1) {
8625
+
8626
+ if (g_main_device != main_device && g_device_count > 1) {
8627
+ g_main_device = main_device;
8022
8628
  cudaDeviceProp prop;
8023
8629
  CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
8024
8630
  fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
@@ -8044,7 +8650,7 @@ void ggml_cuda_free_scratch() {
8044
8650
  }
8045
8651
 
8046
8652
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
8047
- if (!g_cublas_loaded) { return false; }
8653
+ if (!g_cublas_loaded) return false;
8048
8654
 
8049
8655
  ggml_cuda_func_t func;
8050
8656
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8080,6 +8686,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8080
8686
  case GGML_OP_MUL:
8081
8687
  func = ggml_cuda_mul;
8082
8688
  break;
8689
+ case GGML_OP_DIV:
8690
+ func = ggml_cuda_div;
8691
+ break;
8083
8692
  case GGML_OP_UNARY:
8084
8693
  switch (ggml_get_unary_op(tensor)) {
8085
8694
  case GGML_UNARY_OP_GELU:
@@ -8093,7 +8702,8 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8093
8702
  break;
8094
8703
  default:
8095
8704
  return false;
8096
- } break;
8705
+ }
8706
+ break;
8097
8707
  case GGML_OP_NORM:
8098
8708
  func = ggml_cuda_norm;
8099
8709
  break;
@@ -8106,6 +8716,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8106
8716
  }
8107
8717
  func = ggml_cuda_mul_mat;
8108
8718
  break;
8719
+ case GGML_OP_MUL_MAT_ID:
8720
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
8721
+ return false;
8722
+ }
8723
+ func = ggml_cuda_mul_mat_id;
8724
+ break;
8109
8725
  case GGML_OP_SCALE:
8110
8726
  func = ggml_cuda_scale;
8111
8727
  break;
@@ -8145,6 +8761,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8145
8761
  case GGML_OP_IM2COL:
8146
8762
  func = ggml_cuda_im2col;
8147
8763
  break;
8764
+ case GGML_OP_SUM_ROWS:
8765
+ func = ggml_cuda_sum_rows;
8766
+ break;
8767
+ case GGML_OP_ARGSORT:
8768
+ func = ggml_cuda_argsort;
8769
+ break;
8148
8770
  default:
8149
8771
  return false;
8150
8772
  }
@@ -8161,7 +8783,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
8161
8783
 
8162
8784
  int ggml_cuda_get_device_count() {
8163
8785
  int device_count;
8164
- CUDA_CHECK(cudaGetDeviceCount(&device_count));
8786
+ if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
8787
+ return 0;
8788
+ }
8165
8789
  return device_count;
8166
8790
  }
8167
8791
 
@@ -8177,27 +8801,16 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
8177
8801
 
8178
8802
  #define UNUSED GGML_UNUSED
8179
8803
 
8180
- struct ggml_backend_context_cuda {
8181
- };
8182
-
8183
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
8184
- return GGML_CUDA_NAME;
8185
-
8186
- UNUSED(backend);
8187
- }
8188
-
8189
- static void ggml_backend_cuda_free(ggml_backend_t backend) {
8190
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
8191
- delete cuda_ctx;
8192
- delete backend;
8193
- }
8804
+ // cuda buffer
8194
8805
 
8195
8806
  struct ggml_backend_buffer_context_cuda {
8196
- void * device;
8197
-
8807
+ int device;
8808
+ void * dev_ptr = nullptr;
8198
8809
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
8199
8810
  size_t temp_tensor_extra_index = 0;
8200
8811
 
8812
+ ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
8813
+
8201
8814
  ~ggml_backend_buffer_context_cuda() {
8202
8815
  delete[] temp_tensor_extras;
8203
8816
  }
@@ -8218,41 +8831,20 @@ struct ggml_backend_buffer_context_cuda {
8218
8831
 
8219
8832
  static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
8220
8833
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8221
- CUDA_CHECK(cudaFree(ctx->device));
8834
+ CUDA_CHECK(cudaFree(ctx->dev_ptr));
8222
8835
  delete ctx;
8223
8836
  }
8224
8837
 
8225
8838
  static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
8226
8839
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8227
- return ctx->device;
8228
- }
8229
-
8230
- static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8231
- int64_t row_low = 0;
8232
- int64_t row_high = ggml_nrows(tensor);
8233
- int64_t nrows_split = row_high - row_low;
8234
-
8235
- size_t size = ggml_nbytes_split(tensor, nrows_split);
8236
-
8237
- int64_t ne0 = tensor->ne[0];
8238
-
8239
- if (ggml_is_quantized(tensor->type)) {
8240
- if (ne0 % MATRIX_ROW_PADDING != 0) {
8241
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8242
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8243
- }
8244
- }
8245
-
8246
- return size;
8247
-
8248
- UNUSED(buffer);
8840
+ return ctx->dev_ptr;
8249
8841
  }
8250
8842
 
8251
8843
  static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
8252
8844
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8253
8845
 
8254
8846
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
8255
- assert(tensor->view_src->buffer->backend == buffer->backend);
8847
+ assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
8256
8848
  tensor->backend = tensor->view_src->backend;
8257
8849
  tensor->extra = tensor->view_src->extra;
8258
8850
  return;
@@ -8260,7 +8852,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8260
8852
 
8261
8853
  ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
8262
8854
 
8263
- extra->data_device[g_main_device] = tensor->data;
8855
+ extra->data_device[ctx->device] = tensor->data;
8264
8856
 
8265
8857
  tensor->backend = GGML_BACKEND_GPU;
8266
8858
  tensor->extra = extra;
@@ -8272,64 +8864,208 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
8272
8864
  int64_t nrows_split = row_high - row_low;
8273
8865
 
8274
8866
  size_t original_size = ggml_nbytes_split(tensor, nrows_split);
8275
- size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
8867
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
8276
8868
 
8277
8869
  if (padded_size > original_size && tensor->view_src == nullptr) {
8278
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
8870
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
8279
8871
  }
8280
8872
  }
8281
8873
 
8282
8874
  UNUSED(buffer);
8283
8875
  }
8284
8876
 
8877
+ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
8878
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
8879
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8880
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8881
+
8882
+ CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
8883
+
8884
+ UNUSED(buffer);
8885
+ }
8886
+
8887
+ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
8888
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
8889
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8890
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8891
+
8892
+ CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
8893
+
8894
+ UNUSED(buffer);
8895
+ }
8896
+
8285
8897
  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
8286
- /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
8287
- /* .get_base = */ ggml_backend_cuda_buffer_get_base,
8288
- /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
8289
- /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
8290
- /* .free_tensor = */ NULL,
8898
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
8899
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
8900
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
8901
+ /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
8902
+ /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
8903
+ /* .cpy_tensor_from = */ NULL,
8904
+ /* .cpy_tensor_to = */ NULL,
8291
8905
  };
8292
8906
 
8293
- static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
8294
- ggml_cuda_set_device(g_main_device);
8907
+ // cuda buffer type
8295
8908
 
8296
- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
8909
+ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
8910
+ int device = (int) (intptr_t) buft->context;
8911
+
8912
+ ggml_cuda_set_device(device);
8297
8913
 
8298
8914
  size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
8299
8915
 
8300
- ggml_cuda_set_device(g_main_device);
8301
- CUDA_CHECK(cudaMalloc(&ctx->device, size));
8916
+ void * dev_ptr;
8917
+ CUDA_CHECK(cudaMalloc(&dev_ptr, size));
8302
8918
 
8303
- return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
8919
+ ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
8920
+
8921
+ return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
8304
8922
  }
8305
8923
 
8306
- static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
8924
+ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
8307
8925
  return 128;
8926
+
8927
+ UNUSED(buft);
8928
+ }
8929
+
8930
+ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
8931
+ int64_t row_low = 0;
8932
+ int64_t row_high = ggml_nrows(tensor);
8933
+ int64_t nrows_split = row_high - row_low;
8934
+
8935
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
8936
+
8937
+ int64_t ne0 = tensor->ne[0];
8938
+
8939
+ if (ggml_is_quantized(tensor->type)) {
8940
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
8941
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
8942
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
8943
+ }
8944
+ }
8945
+
8946
+ return size;
8947
+
8948
+ UNUSED(buft);
8949
+ }
8950
+
8951
+ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
8952
+ return ggml_backend_is_cuda(backend);
8953
+
8954
+ UNUSED(buft);
8955
+ }
8956
+
8957
+ static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
8958
+ /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
8959
+ /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
8960
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
8961
+ /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
8962
+ };
8963
+
8964
+ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
8965
+ static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
8966
+ static bool ggml_backend_buffer_type_cuda_initialized = false;
8967
+ if (!ggml_backend_buffer_type_cuda_initialized) {
8968
+ for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
8969
+ ggml_backend_buffer_type_cuda[i] = {
8970
+ /* .iface = */ cuda_backend_buffer_type_interface,
8971
+ /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
8972
+ };
8973
+ }
8974
+ ggml_backend_buffer_type_cuda_initialized = true;
8975
+ }
8976
+
8977
+ return &ggml_backend_buffer_type_cuda[device];
8978
+ }
8979
+
8980
+ // host buffer type
8981
+
8982
+ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
8983
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
8984
+ CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
8985
+ delete ctx;
8986
+ }
8987
+
8988
+ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
8989
+ void * ptr;
8990
+ CUDA_CHECK(cudaMallocHost(&ptr, size));
8991
+
8992
+ // FIXME: this is a hack to avoid having to implement a new buffer type
8993
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
8994
+ buffer->buft = buft;
8995
+ buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
8996
+
8997
+ return buffer;
8998
+
8999
+ UNUSED(buft);
9000
+ }
9001
+
9002
+ struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
9003
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
9004
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
9005
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9006
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
9007
+ };
9008
+
9009
+ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
9010
+ static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
9011
+ /* .iface = */ cuda_backend_host_buffer_type_interface,
9012
+ /* .context = */ nullptr,
9013
+ };
9014
+
9015
+ return &ggml_backend_buffer_type_cuda_host;
9016
+ }
9017
+
9018
+ // backend
9019
+
9020
+ struct ggml_backend_context_cuda {
9021
+ int device;
9022
+ };
9023
+
9024
+ static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
9025
+ return GGML_CUDA_NAME;
9026
+
8308
9027
  UNUSED(backend);
8309
9028
  }
8310
9029
 
9030
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
9031
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9032
+
9033
+ delete cuda_ctx;
9034
+ delete backend;
9035
+ }
9036
+
9037
+ static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
9038
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9039
+
9040
+ return ggml_backend_cuda_buffer_type(cuda_ctx->device);
9041
+ }
9042
+
8311
9043
  static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
9044
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9045
+
9046
+ GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
8312
9047
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
8313
9048
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8314
9049
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8315
9050
 
8316
- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
8317
-
8318
- UNUSED(backend);
9051
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
8319
9052
  }
8320
9053
 
8321
9054
  static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
9055
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9056
+
9057
+ GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
8322
9058
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
8323
9059
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
8324
9060
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
8325
9061
 
8326
- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8327
-
8328
- UNUSED(backend);
9062
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
8329
9063
  }
8330
9064
 
8331
9065
  static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
8332
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
9066
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9067
+
9068
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
8333
9069
 
8334
9070
  UNUSED(backend);
8335
9071
  }
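The hunk above splits buffer handling into a per-device buffer type and a pinned "host buffer type" backed by `cudaMallocHost`/`cudaFreeHost` (wrapped around the CPU buffer, as the FIXME notes). Pinned (page-locked) host memory is what allows `cudaMemcpyAsync` to actually overlap transfers with compute; a minimal standalone illustration of the allocation pattern, unrelated to the ggml API:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        const size_t size = 1 << 20;

        float * host = nullptr;
        if (cudaMallocHost((void **) &host, size) != cudaSuccess) { // pinned allocation
            fprintf(stderr, "cudaMallocHost failed\n");
            return 1;
        }

        float * dev = nullptr;
        cudaMalloc((void **) &dev, size);

        cudaStream_t stream;
        cudaStreamCreate(&stream);

        // the H2D copy is only truly asynchronous because 'host' is pinned
        cudaMemcpyAsync(dev, host, size, cudaMemcpyHostToDevice, stream);
        cudaStreamSynchronize(stream);

        cudaStreamDestroy(stream);
        cudaFree(dev);
        cudaFreeHost(host);
        return 0;
    }
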
@@ -8343,14 +9079,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
8343
9079
  UNUSED(cgraph);
8344
9080
  }
8345
9081
 
8346
- [[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
9082
+ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8347
9083
  GGML_ASSERT(!"not implemented");
8348
9084
 
8349
9085
  UNUSED(backend);
8350
9086
  UNUSED(plan);
8351
9087
  }
8352
9088
 
8353
- [[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
9089
+ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
8354
9090
  GGML_ASSERT(!"not implemented");
8355
9091
 
8356
9092
  UNUSED(backend);
@@ -8358,7 +9094,9 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
8358
9094
  }
8359
9095
 
8360
9096
  static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
8361
- ggml_cuda_set_device(g_main_device);
9097
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
9098
+
9099
+ ggml_cuda_set_main_device(cuda_ctx->device);
8362
9100
 
8363
9101
  ggml_compute_params params = {};
8364
9102
  params.type = GGML_TASK_COMPUTE;
@@ -8366,13 +9104,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
8366
9104
  for (int i = 0; i < cgraph->n_nodes; i++) {
8367
9105
  ggml_tensor * node = cgraph->nodes[i];
8368
9106
 
8369
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
9107
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
8370
9108
  continue;
8371
- }
9109
+
8372
9110
  assert(node->backend == GGML_BACKEND_GPU);
9111
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
9112
+ assert(node->extra != nullptr);
9113
+
8373
9114
  for (int j = 0; j < GGML_MAX_SRC; j++) {
8374
9115
  if (node->src[j] != nullptr) {
8375
9116
  assert(node->src[j]->backend == GGML_BACKEND_GPU);
9117
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
9118
+ assert(node->src[j]->extra != nullptr);
8376
9119
  }
8377
9120
  }
8378
9121
 
@@ -8409,27 +9152,98 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
8409
9152
  UNUSED(backend);
8410
9153
  }
8411
9154
 
9155
+ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
9156
+ switch (op->op) {
9157
+ case GGML_OP_UNARY:
9158
+ switch (ggml_get_unary_op(op)) {
9159
+ case GGML_UNARY_OP_GELU:
9160
+ case GGML_UNARY_OP_SILU:
9161
+ case GGML_UNARY_OP_RELU:
9162
+ return true;
9163
+ default:
9164
+ return false;
9165
+ }
9166
+ break;
9167
+ case GGML_OP_MUL_MAT:
9168
+ case GGML_OP_MUL_MAT_ID:
9169
+ {
9170
+ struct ggml_tensor * a;
9171
+ struct ggml_tensor * b;
9172
+ if (op->op == GGML_OP_MUL_MAT) {
9173
+ a = op->src[0];
9174
+ b = op->src[1];
9175
+ } else {
9176
+ a = op->src[2];
9177
+ b = op->src[1];
9178
+ }
9179
+ if (a->ne[3] != b->ne[3]) {
9180
+ return false;
9181
+ }
9182
+ return true;
9183
+ } break;
9184
+ case GGML_OP_NONE:
9185
+ case GGML_OP_RESHAPE:
9186
+ case GGML_OP_VIEW:
9187
+ case GGML_OP_PERMUTE:
9188
+ case GGML_OP_TRANSPOSE:
9189
+ case GGML_OP_NORM:
9190
+ case GGML_OP_REPEAT:
9191
+ case GGML_OP_GET_ROWS:
9192
+ case GGML_OP_DUP:
9193
+ case GGML_OP_ADD:
9194
+ case GGML_OP_MUL:
9195
+ case GGML_OP_DIV:
9196
+ case GGML_OP_RMS_NORM:
9197
+ case GGML_OP_SCALE:
9198
+ case GGML_OP_SQR:
9199
+ case GGML_OP_CLAMP:
9200
+ case GGML_OP_CPY:
9201
+ case GGML_OP_CONT:
9202
+ case GGML_OP_DIAG_MASK_INF:
9203
+ case GGML_OP_SOFT_MAX:
9204
+ case GGML_OP_ROPE:
9205
+ case GGML_OP_ALIBI:
9206
+ case GGML_OP_IM2COL:
9207
+ case GGML_OP_SUM_ROWS:
9208
+ case GGML_OP_ARGSORT:
9209
+ return true;
9210
+ default:
9211
+ return false;
9212
+ }
9213
+
9214
+ UNUSED(backend);
9215
+ }
9216
+
8412
9217
  static ggml_backend_i cuda_backend_i = {
8413
- /* .get_name = */ ggml_backend_cuda_name,
8414
- /* .free = */ ggml_backend_cuda_free,
8415
- /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
8416
- /* .get_alignment = */ ggml_backend_cuda_get_alignment,
8417
- /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
8418
- /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
8419
- /* .synchronize = */ ggml_backend_cuda_synchronize,
8420
- /* .cpy_tensor_from = */ nullptr,
8421
- /* .cpy_tensor_to = */ nullptr,
8422
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
8423
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
8424
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
8425
- /* .graph_compute = */ ggml_backend_cuda_graph_compute,
8426
- /* .supports_op = */ nullptr,
9218
+ /* .get_name = */ ggml_backend_cuda_name,
9219
+ /* .free = */ ggml_backend_cuda_free,
9220
+ /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
9221
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
9222
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
9223
+ /* .cpy_tensor_from_async = */ NULL,
9224
+ /* .cpy_tensor_to_async = */ NULL,
9225
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
9226
+ /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
9227
+ /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
9228
+ /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
9229
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
9230
+ /* .supports_op = */ ggml_backend_cuda_supports_op,
8427
9231
  };
8428
9232
 
8429
- ggml_backend_t ggml_backend_cuda_init() {
9233
+ ggml_backend_t ggml_backend_cuda_init(int device) {
8430
9234
  ggml_init_cublas(); // TODO: remove from ggml.c
8431
9235
 
8432
- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
9236
+ if (device < 0 || device >= ggml_cuda_get_device_count()) {
9237
+ fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
9238
+ return nullptr;
9239
+ }
9240
+
9241
+ // not strictly necessary, but it may reduce the overhead of the first graph_compute
9242
+ ggml_cuda_set_main_device(device);
9243
+
9244
+ ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
9245
+ /* .device = */ device
9246
+ };
8433
9247
 
8434
9248
  ggml_backend_t cuda_backend = new ggml_backend {
8435
9249
  /* .interface = */ cuda_backend_i,
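`ggml_backend_cuda_init` now takes an explicit device index, validates it against `ggml_cuda_get_device_count`, and returns nullptr on failure instead of asserting. A hedged usage sketch, assuming the public declarations live in ggml-cuda.h / ggml-backend.h and that `ggml_backend_name` / `ggml_backend_free` are available as in the rest of the ggml backend API:

    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main() {
        ggml_backend_t backend = ggml_backend_cuda_init(0); // device 0
        if (backend == nullptr) {
            fprintf(stderr, "no usable CUDA device\n");
            return 1;
        }
        printf("initialized backend: %s\n", ggml_backend_name(backend));
        ggml_backend_free(backend);
        return 0;
    }
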
@@ -8438,3 +9252,25 @@ ggml_backend_t ggml_backend_cuda_init() {
8438
9252
 
8439
9253
  return cuda_backend;
8440
9254
  }
9255
+
9256
+ bool ggml_backend_is_cuda(ggml_backend_t backend) {
9257
+ return backend->iface.get_name == ggml_backend_cuda_name;
9258
+ }
9259
+
9260
+ static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
9261
+ ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
9262
+ return cuda_backend;
9263
+
9264
+ UNUSED(params);
9265
+ }
9266
+
9267
+ extern "C" int ggml_backend_cuda_reg_devices() {
9268
+ int device_count = ggml_cuda_get_device_count();
9269
+ //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
9270
+ for (int i = 0; i < device_count; i++) {
9271
+ char name[128];
9272
+ snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
9273
+ ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
9274
+ }
9275
+ return device_count;
9276
+ }