llama_cpp 0.9.1 → 0.9.3

This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -81,12 +81,15 @@
 
  #include "ggml-cuda.h"
  #include "ggml.h"
+ #include "ggml-backend-impl.h"
 
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
  #define CC_VOLTA 700
  #define CC_OFFSET_AMD 1000000
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+ #define GGML_CUDA_MAX_NODES 8192
+
  // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
  // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
  // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -433,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_MUL_BLOCK_SIZE 256
  #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
+ #define CUDA_RELU_BLOCK_SIZE 256
+ #define CUDA_SQR_BLOCK_SIZE 256
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -553,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
      dst[i] = x[i] / (1.0f + expf(-x[i]));
  }
 
+ static __global__ void relu_f32(const float * x, float * dst, const int k) {
+     const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = fmaxf(x[i], 0);
+ }
+
+ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+     const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = x[i] * x[i];
+ }
+
  static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
  #pragma unroll
      for (int mask = 16; mask > 0; mask >>= 1) {
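
Note: the relu_f32 and sqr_f32 kernels added above follow the same element-wise pattern as the existing silu_f32: one thread per element, with the `i >= k` guard because the grid is rounded up to whole blocks. A minimal, hypothetical host-side check (not part of the package) for a kernel of this shape could look like:

    // hypothetical standalone test for relu_f32; assumes the kernel above is in the same .cu file
    #include <cmath>
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        const int k     = 1000;                    // deliberately not a multiple of the block size
        const int block = 256;                     // same value as CUDA_RELU_BLOCK_SIZE
        const int grid  = (k + block - 1) / block; // ceil division, as in relu_f32_cuda further down

        float *x, *dst;
        cudaMallocManaged(&x,   k*sizeof(float));
        cudaMallocManaged(&dst, k*sizeof(float));
        for (int i = 0; i < k; ++i) x[i] = (i % 2 ? 1.0f : -1.0f) * i;

        relu_f32<<<grid, block>>>(x, dst, k);
        cudaDeviceSynchronize();

        for (int i = 0; i < k; ++i) {
            if (dst[i] != fmaxf(x[i], 0.0f)) { printf("mismatch at %d\n", i); return 1; }
        }
        printf("ok\n");
        return 0;
    }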
@@ -982,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
      static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-     const int row = blockIdx.y*blockDim.y + threadIdx.y;
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row > nrows) return;
 
      const int num_blocks_per_row = ncols / QK_K;
@@ -1086,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
  static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-     const int row = blockIdx.y*blockDim.y + threadIdx.y;
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row > nrows) return;
 
      const int num_blocks_per_row = ncols / QK_K;
@@ -1190,7 +1213,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
  static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-     const int row = blockIdx.y*blockDim.y + threadIdx.y;
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row > nrows) return;
      const int num_blocks_per_row = ncols / QK_K;
      const int ib0 = row*num_blocks_per_row;
@@ -1444,7 +1467,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
      static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-     const int row = blockIdx.y*blockDim.y + threadIdx.y;
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row > nrows) return;
 
      const int num_blocks_per_row = ncols / QK_K;
@@ -4254,7 +4277,7 @@ template <bool need_check> static __global__ void
 
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-     const int row = blockIdx.y*blockDim.y + threadIdx.y;
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
      if (row >= nrows) {
          return;
@@ -4294,7 +4317,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
  static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
      // qk = quantized weights per x block
      // qr = number of quantized weights per data value in x block
-     const int row = blockIdx.y*blockDim.y + threadIdx.y;
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
      if (row >= nrows) {
          return;
@@ -4468,6 +4491,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
      *dsti = __float2half(*xi);
  }
 
+ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+     const half * xi = (const half *) cxi;
+     half * dsti = (half *) cdsti;
+
+     *dsti = *xi;
+ }
+
  template <cpy_kernel_t cpy_1>
  static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
      const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4751,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
      dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
  }
 
+ static __global__ void im2col_f32_f16(
+     const float * x, half * dst,
+     int ofs0, int ofs1, int IW, int IH, int CHW,
+     int s0, int s1, int p0, int p1, int d0, int d1) {
+     const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+     const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+     const int offset_dst =
+         (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+         (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+         dst[offset_dst] = __float2half(0.0f);
+     } else {
+         const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+         dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+     }
+ }
+
  template<int qk, int qr, dequantize_kernel_t dq>
  static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
      const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
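
Note: im2col_f32_f16 writes the destination in [N, OH, OW, IC*KH*KW] order, one thread per (n, kh, kw) inside a block indexed by (ic, oh, ow); taps that fall into the padding are written as zero. A hedged CPU reference of the same index mapping, assuming a contiguous float32 input of shape [N, IC, IH, IW] (the CUDA path instead receives the byte-derived offsets ofs0/ofs1):

    // hypothetical CPU reference for the mapping above; float output instead of half for clarity
    static void im2col_cpu_ref(const float * x, float * dst,
                               int N, int IC, int IH, int IW, int KH, int KW, int OH, int OW,
                               int s0, int s1, int p0, int p1, int d0, int d1) {
        const int CHW = IC*KH*KW;
        for (int n  = 0; n  < N;  ++n)
        for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow)
        for (int ic = 0; ic < IC; ++ic)
        for (int kh = 0; kh < KH; ++kh)
        for (int kw = 0; kw < KW; ++kw) {
            const int iih = oh*s1 + kh*d1 - p1;   // input row hit by this tap (may land in padding)
            const int iiw = ow*s0 + kw*d0 - p0;   // input column hit by this tap
            const int od  = (n*OH*OW + oh*OW + ow)*CHW + ic*KH*KW + kh*KW + kw;
            dst[od] = (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)
                ? 0.0f
                : x[((n*IC + ic)*IH + iih)*IW + iiw];  // contiguous layout: ofs0 = IC*IH*IW, ofs1 = IH*IW
        }
    }

The launcher further down maps block_nums to (IC, OH, OW) and block_dims to (N, KH, KW), so N*KH*KW has to stay within the device's threads-per-block limit (1024 on current GPUs).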
@@ -4759,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
      silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
  }
 
+ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+     relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
+ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+     const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+     sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
  static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % WARP_SIZE == 0);
      if (ncols < 1024) {
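
Note: the two launchers added above use the usual round-up so that every element gets a thread. For example, with k = 1000 and a block size of 256, (1000 + 256 - 1) / 256 = 4 blocks are launched, i.e. 1024 threads; the `i >= k` check inside relu_f32/sqr_f32 makes the last 24 threads return without touching memory.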
@@ -4867,7 +4926,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
          <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
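
Note: the comment introduced in this hunk explains the whole family of block_nums changes that follows: rows used to be mapped to blockIdx.y, and a CUDA grid is limited to 65535 blocks in the y and z dimensions while the x dimension allows 2^31 - 1, so very tall matrices could not be launched. A small hedged probe (not part of the package) that prints the two limits:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        int max_x = 0, max_y = 0;
        cudaDeviceGetAttribute(&max_x, cudaDevAttrMaxGridDimX, 0); // typically 2147483647
        cudaDeviceGetAttribute(&max_y, cudaDevAttrMaxGridDimY, 0); // typically 65535
        printf("max grid dim: x = %d, y = %d\n", max_x, max_y);
        return 0;
    }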
@@ -4876,7 +4936,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
  static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4885,7 +4945,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
  static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
          <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4894,7 +4954,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
  static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4903,7 +4963,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
  static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
          <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4913,7 +4973,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
      GGML_ASSERT(ncols % QK_K == 0);
      const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
      const int block_num_y = (nrows + ny - 1) / ny;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(32, ny, 1);
      dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }
@@ -4922,7 +4982,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
      GGML_ASSERT(ncols % QK_K == 0);
      const int ny = 2 / K_QUANTS_PER_ITERATION;
      const int block_num_y = (nrows + ny - 1) / ny;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(32, ny, 1);
      dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }
@@ -4931,7 +4991,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
      GGML_ASSERT(ncols % QK_K == 0);
      const int ny = 2 / K_QUANTS_PER_ITERATION;
      const int block_num_y = (nrows + ny - 1) / ny;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(32, ny, 1);
      dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }
@@ -4946,7 +5006,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
      GGML_ASSERT(ncols % QK_K == 0);
      const int ny = 2 / K_QUANTS_PER_ITERATION;
      const int block_num_y = (nrows + ny - 1) / ny;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(32, ny, 1);
      dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }
@@ -4954,7 +5014,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK4_0 == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4963,7 +5023,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK4_1 == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4972,7 +5032,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK5_0 == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4981,7 +5041,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK5_1 == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4990,7 +5050,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK8_0 == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4999,7 +5059,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK_K == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5008,7 +5068,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK_K == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5017,7 +5077,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK_K == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5026,7 +5086,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK_K == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5035,7 +5095,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
  static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % QK_K == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
          <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5054,7 +5114,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
      GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
      const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-     const dim3 block_nums(1, block_num_y, 1);
+     const dim3 block_nums(block_num_y, 1, 1);
      const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
      dequantize_mul_mat_vec<1, 1, convert_f16>
          <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5610,6 +5670,16 @@ static void ggml_cpy_f32_f16_cuda(
          (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
  }
 
+ static void ggml_cpy_f16_f16_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+     const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+     cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
  static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
      const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
      scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5693,6 +5763,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
      soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
  }
 
+ static void im2col_f32_f16_cuda(const float * x, half * dst,
+     int OH, int IW, int IH, int OW, int IC,
+     int KH, int KW, int N, int ofs0, int ofs1,
+     int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+     dim3 block_nums(IC, OH, OW);
+     dim3 block_dims(N, KH, KW);
+     im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+ }
+
  // buffer pool for cuda
  #define MAX_CUDA_BUFFERS 256
 
@@ -5761,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
          return ptr;
      }
  #ifdef DEBUG_CUDA_MALLOC
-     fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+     fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
          (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
  #endif
      void * ptr;
@@ -5789,6 +5868,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
      CUDA_CHECK(cudaFree(ptr));
  }
 
+ static bool g_cublas_loaded = false;
+
+ bool ggml_cublas_loaded(void) {
+     return g_cublas_loaded;
+ }
 
  void ggml_init_cublas() {
      static bool initialized = false;
@@ -5802,7 +5886,12 @@ void ggml_init_cublas() {
          CUDA_CHECK(cudaDeviceSynchronize());
  #endif
 
-         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+         if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+             initialized = true;
+             g_cublas_loaded = false;
+             return;
+         }
+
          GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
          int64_t total_vram = 0;
  #if defined(GGML_CUDA_FORCE_MMQ)
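
Note: together with the g_cublas_loaded flag added just above, this hunk turns a missing or broken CUDA driver from a hard CUDA_CHECK abort into a soft failure. A hedged caller-side sketch of the new behaviour (it assumes ggml-cuda.h exposes both ggml_init_cublas() and the new ggml_cublas_loaded() accessor):

    #include <cstdio>
    #include "ggml-cuda.h"

    static void init_gpu_backend(void) {
        ggml_init_cublas();                 // no longer aborts when cudaGetDeviceCount() fails
        if (!ggml_cublas_loaded()) {
            fprintf(stderr, "no usable CUDA device, staying on the CPU path\n");
            // ggml_cuda_can_mul_mat() and ggml_cuda_compute_forward() further down
            // also return false in this state, so graph evaluation falls back to CPU.
        }
    }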
@@ -5850,6 +5939,7 @@ void ggml_init_cublas() {
          // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
          initialized = true;
+         g_cublas_loaded = true;
      }
  }
 
@@ -5888,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
          // The allocation error can be bypassed. A null ptr will assigned out of this function.
          // This can fixed the OOM error in WSL.
          cudaGetLastError();
-         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+         fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
              size/1024.0/1024.0, cudaGetErrorString(err));
          return nullptr;
      }
@@ -6116,6 +6206,34 @@ inline void ggml_cuda_op_silu(
      (void) src1_dd;
  }
 
+ inline void ggml_cuda_op_relu(
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     (void) src1;
+     (void) dst;
+     (void) src1_dd;
+ }
+
+ inline void ggml_cuda_op_sqr(
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     (void) src1;
+     (void) dst;
+     (void) src1_dd;
+ }
+
  inline void ggml_cuda_op_norm(
      const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
      const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6238,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
          case GGML_TYPE_Q8_0:
              return max_compute_capability >= CC_RDNA2 ? 128 : 64;
          case GGML_TYPE_F16:
+         case GGML_TYPE_F32:
              return 1;
          case GGML_TYPE_Q2_K:
              return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6260,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
          case GGML_TYPE_Q8_0:
              return 64;
          case GGML_TYPE_F16:
+         case GGML_TYPE_F32:
              return 1;
          case GGML_TYPE_Q2_K:
          case GGML_TYPE_Q3_K:
@@ -6451,8 +6571,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
          src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
          to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
      }
-     const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
+     const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
      size_t dst_as = 0;
      half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
@@ -6627,6 +6746,45 @@ inline void ggml_cuda_op_alibi(
      (void) src1_dd;
  }
 
+ inline void ggml_cuda_op_im2col(
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F16);
+     GGML_ASSERT(src1->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+     const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+     const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+     const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+     const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+     const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+     const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+     const int64_t N = src1->ne[is_2D ? 3 : 2];
+     const int64_t IC = src1->ne[is_2D ? 2 : 1];
+     const int64_t IH = is_2D ? src1->ne[1] : 1;
+     const int64_t IW = src1->ne[0];
+
+     const int64_t KH = is_2D ? src0->ne[1] : 1;
+     const int64_t KW = src0->ne[0];
+
+     const int64_t OH = is_2D ? dst->ne[2] : 1;
+     const int64_t OW = dst->ne[1];
+
+     const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+     const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+     im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+         OH, IW, IH, OW, IC, KH, KW, N,
+         ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+     (void) src0;
+     (void) src0_dd;
+ }
+
  inline void ggml_cuda_op_diag_mask_inf(
      const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
      const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6892,6 +7050,8 @@ static void ggml_cuda_op_mul_mat(
      int64_t row_low[GGML_CUDA_MAX_DEVICES];
      int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+     int used_devices = 0;
+
      for (int64_t id = 0; id < g_device_count; ++id) {
          // by default, use all rows
          row_low[id] = 0;
@@ -6919,6 +7079,8 @@ static void ggml_cuda_op_mul_mat(
              continue;
          }
 
+         used_devices++;
+
          const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
          const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
@@ -6957,12 +7119,12 @@
 
      // if multiple devices are used they need to wait for the main device
      // here an event is recorded that signals that the main device has finished calculating the input data
-     if (split && g_device_count > 1) {
+     if (split && used_devices > 1) {
          CUDA_CHECK(ggml_cuda_set_device(g_main_device));
          CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
      }
 
-     const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+     const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
      for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
          const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
          const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
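
Note: counting used_devices (devices that actually received a row range) instead of testing g_device_count means a model that ends up entirely on one GPU is treated like the single-GPU case: no cross-device event is recorded and src1_col_stride falls back to ne11, so the column loop runs once. For a genuine split, with hypothetical values ne11 = 500 and a column stride of 128, the loop runs four times and the ternary on the last context line gives the final chunk 500 - 384 = 116 columns.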
@@ -7078,6 +7240,9 @@ static void ggml_cuda_op_mul_mat(
      }
 
      for (int64_t id = 0; id < g_device_count; ++id) {
+         if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+             continue;
+         }
          CUDA_CHECK(ggml_cuda_set_device(id));
 
          // free buffers again when done
@@ -7102,6 +7267,9 @@
 
      CUDA_CHECK(ggml_cuda_set_device(g_main_device));
      for (int64_t id = 0; id < g_device_count; ++id) {
+         if (row_low[id] == row_high[id]) {
+             continue;
+         }
          for (int64_t is = 0; is < is_max; ++is) {
              CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
          }
@@ -7138,6 +7306,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
      ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
  }
 
+ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+ }
+
+ static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+ }
+
  static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
  }
@@ -7147,6 +7323,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  }
 
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+     if (!g_cublas_loaded) return false;
+
      const int64_t ne10 = src1->ne[0];
 
      const int64_t ne0 = dst->ne[0];
@@ -7225,7 +7403,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 
  __global__ void k_compute_batched_ptrs(
      const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
-     void ** ptrs,
+     const void ** ptrs_src, void ** ptrs_dst,
      int ne12, int ne13,
      int ne23,
      int nb02, int nb03,
@@ -7242,9 +7420,9 @@ __global__ void k_compute_batched_ptrs(
      int i03 = i13 / r3;
      int i02 = i12 / r2;
 
-     ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
-     ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-     ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+     ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
  }
 
  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7350,14 +7528,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
      // use cublasGemmBatchedEx
      const int ne23 = ne12*ne13;
 
-     void ** ptrs_as = nullptr;
-     size_t ptrs_s = 0;
-     ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+     const void ** ptrs_src = nullptr;
+     void ** ptrs_dst = nullptr;
+
+     size_t ptrs_src_s = 0;
+     size_t ptrs_dst_s = 0;
+
+     ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+     ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
      dim3 block_dims(ne13, ne12);
      k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
          src0_as_f16, src1_as_f16, dst_f16,
-         ptrs_as,
+         ptrs_src, ptrs_dst,
          ne12, ne13,
          ne23,
          nb02, nb03,
@@ -7369,14 +7552,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
      CUBLAS_CHECK(
      cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
          ne01, ne11, ne10,
-         &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                     (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
-         &beta_f16,  ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+         &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                     (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+         &beta_f16,  ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
          ne23,
          CUBLAS_COMPUTE_16F,
          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-     ggml_cuda_pool_free(ptrs_as, ptrs_s);
+     if (ptrs_src_s != 0) {
+         ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+     }
+     if (ptrs_dst_s != 0) {
+         ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+     }
  }
  #endif
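
Note: the single ptrs_as pool is split because of the parameter types of cublasGemmBatchedEx: the A and B batches are arrays of const pointers while the C batch is an array of mutable pointers, roughly (simplified from the cuBLAS documentation):

    // cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle,
    //         cublasOperation_t transa, cublasOperation_t transb,
    //         int m, int n, int k, const void *alpha,
    //         const void *const Aarray[], cudaDataType Atype, int lda,
    //         const void *const Barray[], cudaDataType Btype, int ldb,
    //         const void *beta, void *const Carray[], cudaDataType Ctype, int ldc,
    //         int batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo);

Keeping the source pointers in a const void ** array and the destination pointers in a separate void ** array matches those types without casting away constness, and the two pool allocations can then be returned independently (hence the two size checks before ggml_cuda_pool_free).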
 
@@ -7389,10 +7577,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      const bool all_on_device =
-         (src0->backend == GGML_BACKEND_GPU) &&
+         (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
          (src1->backend == GGML_BACKEND_GPU) &&
          ( dst->backend == GGML_BACKEND_GPU);
 
+     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
      int64_t min_compute_capability = INT_MAX;
      for (int64_t id = 0; id < g_device_count; ++id) {
          if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@@ -7414,13 +7604,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
      //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
      //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-     if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+     if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
          // KQ single-batch
          ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-     } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+     } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
          // KQV single-batch
          ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-     } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+     } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
          // KQ + KQV multi-batch
          ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
      } else if (src0->type == GGML_TYPE_F32) {
@@ -7507,6 +7697,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
      } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
          ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
              ne10, ne11, nb10, nb11, nb12, main_stream);
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+         ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+             ne10, ne11, nb10, nb11, nb12, main_stream);
      } else {
          fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
              ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7538,6 +7731,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
      ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
  }
 
+ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+ }
+
  static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      (void) src0;
      (void) src1;
@@ -7649,11 +7846,11 @@ static size_t g_temp_tensor_extra_index = 0;
 
  static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
      if (g_temp_tensor_extras == nullptr) {
-         g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+         g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
      }
 
      size_t alloc_index = g_temp_tensor_extra_index;
-     g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+     g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
      ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
      memset(extra, 0, sizeof(*extra));
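
Note: GGML_CUDA_MAX_NODES (8192), introduced at the top of this diff, replaces GGML_MAX_NODES from ggml.h as the size of this scratch array of tensor extras, so the CUDA backend no longer depends on the core graph-size limit. The modulo makes the array a ring buffer: entries are recycled round-robin and never freed individually, which is fine as long as no more than 8192 temporary extras are live at once. A stripped-down sketch of the same pattern, with hypothetical names:

    #define POOL_SIZE 8192                       // stands in for GGML_CUDA_MAX_NODES

    struct extra { void * buffers[8]; };

    static extra  g_pool[POOL_SIZE];
    static size_t g_pool_index = 0;

    static extra * alloc_temp_extra(void) {
        extra * e    = &g_pool[g_pool_index];
        g_pool_index = (g_pool_index + 1) % POOL_SIZE; // wrap around and start reusing old slots
        *e = extra{};                                  // reset, mirroring the memset above
        return e;
    }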
 
@@ -7820,6 +8017,8 @@ void ggml_cuda_free_scratch() {
  }
 
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+     if (!g_cublas_loaded) return false;
+
      ggml_cuda_func_t func;
      const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
          || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
@@ -7829,6 +8028,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
          return false;
      }
 
+     if (tensor->op == GGML_OP_MUL_MAT) {
+         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+ #ifndef NDEBUG
+             fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ #endif
+             return false;
+         }
+     }
+
      switch (tensor->op) {
          case GGML_OP_REPEAT:
              func = ggml_cuda_repeat;
@@ -7853,6 +8061,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
              case GGML_UNARY_OP_SILU:
                  func = ggml_cuda_silu;
                  break;
+             case GGML_UNARY_OP_RELU:
+                 func = ggml_cuda_relu;
+                 break;
              default:
                  return false;
          } break;
@@ -7871,6 +8082,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
          case GGML_OP_SCALE:
              func = ggml_cuda_scale;
              break;
+         case GGML_OP_SQR:
+             func = ggml_cuda_sqr;
+             break;
          case GGML_OP_CLAMP:
              if (!any_on_device) {
                  return false;
@@ -7901,6 +8115,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
          case GGML_OP_ALIBI:
              func = ggml_cuda_alibi;
              break;
+         case GGML_OP_IM2COL:
+             func = ggml_cuda_im2col;
+             break;
          default:
              return false;
      }
@@ -7960,11 +8177,11 @@ struct ggml_backend_buffer_context_cuda {
 
      ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
          if (temp_tensor_extras == nullptr) {
-             temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+             temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
          }
 
          size_t alloc_index = temp_tensor_extra_index;
-         temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+         temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
          ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
          memset(extra, 0, sizeof(*extra));
 
@@ -8050,7 +8267,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
      ggml_cuda_set_device(g_main_device);
 
      ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+     ggml_cuda_set_device(g_main_device);
      CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
      return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
  }
 
@@ -8117,6 +8339,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
      for (int i = 0; i < cgraph->n_nodes; i++) {
          ggml_tensor * node = cgraph->nodes[i];
 
+         if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+             continue;
          assert(node->backend == GGML_BACKEND_GPU);
          for (int j = 0; j < GGML_MAX_SRC; j++) {
              if (node->src[j] != nullptr) {