llama_cpp 0.9.1 → 0.9.3

@@ -81,12 +81,15 @@
 
 #include "ggml-cuda.h"
 #include "ggml.h"
+ #include "ggml-backend-impl.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+ #define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -433,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+ #define CUDA_RELU_BLOCK_SIZE 256
+ #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -553,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
 dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+ static __global__ void relu_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = fmaxf(x[i], 0);
+ }
+
+ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = x[i] * x[i];
+ }
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
 for (int mask = 16; mask > 0; mask >>= 1) {
@@ -982,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
 if (row > nrows) return;
 
 const int num_blocks_per_row = ncols / QK_K;
@@ -1086,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
 if (row > nrows) return;
 
 const int num_blocks_per_row = ncols / QK_K;
@@ -1190,7 +1213,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
 if (row > nrows) return;
 const int num_blocks_per_row = ncols / QK_K;
 const int ib0 = row*num_blocks_per_row;
@@ -1444,7 +1467,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
 static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
 if (row > nrows) return;
 
 const int num_blocks_per_row = ncols / QK_K;
@@ -4254,7 +4277,7 @@ template <bool need_check> static __global__ void
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
 if (row >= nrows) {
 return;
@@ -4294,7 +4317,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
 // qk = quantized weights per x block
 // qr = number of quantized weights per data value in x block
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
 if (row >= nrows) {
 return;
@@ -4468,6 +4491,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
 *dsti = __float2half(*xi);
 }
 
+ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+ const half * xi = (const half *) cxi;
+ half * dsti = (half *) cdsti;
+
+ *dsti = *xi;
+ }
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4751,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
 dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
+ static __global__ void im2col_f32_f16(
+ const float * x, half * dst,
+ int ofs0, int ofs1, int IW, int IH, int CHW,
+ int s0, int s1, int p0, int p1, int d0, int d1) {
+ const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+ const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+ const int offset_dst =
+ (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+ (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+ dst[offset_dst] = __float2half(0.0f);
+ } else {
+ const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+ dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+ }
+ }
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
 const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
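Note on the new im2col kernel above: later in this diff it is launched with grid (IC, OH, OW) and block (N, KH, KW), and it writes an F16 destination laid out as [N, OH, OW, IC*KH*KW]. The following CPU reference is an illustrative sketch only, not part of the diff; it assumes a contiguous [N, IC, IH, IW] F32 source (so ofs0 = IC*IH*IW and ofs1 = IH*IW in the kernel's terms) and ggml.h's fp16 helpers.

 // Sketch: mirrors the GPU kernel's index math for a contiguous source tensor.
 static void im2col_f32_f16_ref(const float * x, ggml_fp16_t * dst,
         int N, int IC, int IH, int IW, int OH, int OW, int KH, int KW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
     const int CHW = IC*KH*KW;
     for (int n = 0; n < N; ++n)
     for (int ic = 0; ic < IC; ++ic)
     for (int oh = 0; oh < OH; ++oh)
     for (int ow = 0; ow < OW; ++ow)
     for (int kh = 0; kh < KH; ++kh)
     for (int kw = 0; kw < KW; ++kw) {
         const int iih = oh*s1 + kh*d1 - p1; // same formula as iih in the kernel
         const int iiw = ow*s0 + kw*d0 - p0; // same formula as iiw in the kernel
         const int idst = ((n*OH + oh)*OW + ow)*CHW + ic*KH*KW + kh*KW + kw;
         float v = 0.0f;                     // taps outside the input are zero-padded
         if (iih >= 0 && iih < IH && iiw >= 0 && iiw < IW) {
             v = x[((n*IC + ic)*IH + iih)*IW + iiw];
         }
         dst[idst] = ggml_fp32_to_fp16(v);
     }
 }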
@@ -4759,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
 silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+ relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
+ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+ sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % WARP_SIZE == 0);
 if (ncols < 1024) {
@@ -4867,7 +4926,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
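The comment added above explains the block_nums change that repeats through the launchers below: a CUDA grid allows at most 65535 blocks in its y and z dimensions but up to 2^31-1 in x, so laying the row blocks out along y could overflow for very tall matrices. The snippet below is an illustrative sketch of the paired host/device change (same identifiers as in the diff, not new code):

 // host side: row blocks now placed along gridDim.x instead of gridDim.y
 const int  block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 const dim3 block_nums(block_num_y, 1, 1);              // was: dim3(1, block_num_y, 1)
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);

 // device side: the kernels read the matching grid coordinate
 // const int row = blockIdx.x*blockDim.y + threadIdx.y;  // was: blockIdx.y*blockDim.y + threadIdx.y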
@@ -4876,7 +4936,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4885,7 +4945,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4894,7 +4954,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4903,7 +4963,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4913,7 +4973,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
 GGML_ASSERT(ncols % QK_K == 0);
 const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
 const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(32, ny, 1);
 dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4922,7 +4982,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
 GGML_ASSERT(ncols % QK_K == 0);
 const int ny = 2 / K_QUANTS_PER_ITERATION;
 const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(32, ny, 1);
 dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4931,7 +4991,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
 GGML_ASSERT(ncols % QK_K == 0);
 const int ny = 2 / K_QUANTS_PER_ITERATION;
 const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(32, ny, 1);
 dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4946,7 +5006,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 GGML_ASSERT(ncols % QK_K == 0);
 const int ny = 2 / K_QUANTS_PER_ITERATION;
 const int block_num_y = (nrows + ny - 1) / ny;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(32, ny, 1);
 dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4954,7 +5014,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK4_0 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4963,7 +5023,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK4_1 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4972,7 +5032,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK5_0 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4981,7 +5041,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK5_1 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4990,7 +5050,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK8_0 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4999,7 +5059,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5008,7 +5068,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5017,7 +5077,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5026,7 +5086,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5035,7 +5095,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5054,7 +5114,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
- const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<1, 1, convert_f16>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5610,6 +5670,16 @@ static void ggml_cpy_f32_f16_cuda(
 (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
+ static void ggml_cpy_f16_f16_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
 scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5693,6 +5763,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
 soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
+ static void im2col_f32_f16_cuda(const float * x, half * dst,
+ int OH, int IW, int IH, int OW, int IC,
+ int KH, int KW, int N, int ofs0, int ofs1,
+ int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+ dim3 block_nums(IC, OH, OW);
+ dim3 block_dims(N, KH, KW);
+ im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+ }
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256
 
@@ -5761,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
 return ptr;
 }
 #ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+ fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
 (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
 void * ptr;
@@ -5789,6 +5868,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 CUDA_CHECK(cudaFree(ptr));
 }
 
+ static bool g_cublas_loaded = false;
+
+ bool ggml_cublas_loaded(void) {
+ return g_cublas_loaded;
+ }
 
 void ggml_init_cublas() {
 static bool initialized = false;
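The new ggml_cublas_loaded() query lets callers find out whether CUDA/cuBLAS initialization actually succeeded; the following hunks set the flag and make ggml_init_cublas() return gracefully when cudaGetDeviceCount() fails. A minimal, hypothetical usage sketch (caller code, not taken from this diff):

 // hypothetical caller: probe once at startup, then choose a backend
 ggml_init_cublas();                  // no longer aborts when no CUDA device is present
 if (!ggml_cublas_loaded()) {
     // no usable CUDA device/driver was found: keep everything on the CPU backend
 }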
@@ -5802,7 +5886,12 @@ void ggml_init_cublas() {
 CUDA_CHECK(cudaDeviceSynchronize());
 #endif
 
- CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+ if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+ initialized = true;
+ g_cublas_loaded = false;
+ return;
+ }
+
 GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
 int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
@@ -5850,6 +5939,7 @@ void ggml_init_cublas() {
 // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
 initialized = true;
+ g_cublas_loaded = true;
 }
 }
 
@@ -5888,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
 // The allocation error can be bypassed. A null ptr will assigned out of this function.
 // This can fixed the OOM error in WSL.
 cudaGetLastError();
- fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+ fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
 size/1024.0/1024.0, cudaGetErrorString(err));
 return nullptr;
 }
@@ -6116,6 +6206,34 @@ inline void ggml_cuda_op_silu(
 (void) src1_dd;
 }
 
+ inline void ggml_cuda_op_relu(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+ }
+
+ inline void ggml_cuda_op_sqr(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+ (void) src1;
+ (void) dst;
+ (void) src1_dd;
+ }
+
 inline void ggml_cuda_op_norm(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6238,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
 case GGML_TYPE_Q8_0:
 return max_compute_capability >= CC_RDNA2 ? 128 : 64;
 case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
 return 1;
 case GGML_TYPE_Q2_K:
 return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6260,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
 case GGML_TYPE_Q8_0:
 return 64;
 case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
 return 1;
 case GGML_TYPE_Q2_K:
 case GGML_TYPE_Q3_K:
@@ -6451,8 +6571,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
 src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
 to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
 }
- const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
 size_t dst_as = 0;
 half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
@@ -6627,6 +6746,45 @@ inline void ggml_cuda_op_alibi(
 (void) src1_dd;
 }
 
+ inline void ggml_cuda_op_im2col(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+ const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+ const int64_t N = src1->ne[is_2D ? 3 : 2];
+ const int64_t IC = src1->ne[is_2D ? 2 : 1];
+ const int64_t IH = is_2D ? src1->ne[1] : 1;
+ const int64_t IW = src1->ne[0];
+
+ const int64_t KH = is_2D ? src0->ne[1] : 1;
+ const int64_t KW = src0->ne[0];
+
+ const int64_t OH = is_2D ? dst->ne[2] : 1;
+ const int64_t OW = dst->ne[1];
+
+ const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+ const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+ im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+ OH, IW, IH, OW, IC, KH, KW, N,
+ ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+ (void) src0;
+ (void) src0_dd;
+ }
+
 inline void ggml_cuda_op_diag_mask_inf(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6892,6 +7050,8 @@ static void ggml_cuda_op_mul_mat(
 int64_t row_low[GGML_CUDA_MAX_DEVICES];
 int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+ int used_devices = 0;
+
 for (int64_t id = 0; id < g_device_count; ++id) {
 // by default, use all rows
 row_low[id] = 0;
@@ -6919,6 +7079,8 @@
 continue;
 }
 
+ used_devices++;
+
 const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
 const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
@@ -6957,12 +7119,12 @@
 
 // if multiple devices are used they need to wait for the main device
 // here an event is recorded that signals that the main device has finished calculating the input data
- if (split && g_device_count > 1) {
+ if (split && used_devices > 1) {
 CUDA_CHECK(ggml_cuda_set_device(g_main_device));
 CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
 }
 
- const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+ const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
 for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
 const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
 const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -7078,6 +7240,9 @@
 }
 
 for (int64_t id = 0; id < g_device_count; ++id) {
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+ continue;
+ }
 CUDA_CHECK(ggml_cuda_set_device(id));
 
 // free buffers again when done
@@ -7102,6 +7267,9 @@
 
 CUDA_CHECK(ggml_cuda_set_device(g_main_device));
 for (int64_t id = 0; id < g_device_count; ++id) {
+ if (row_low[id] == row_high[id]) {
+ continue;
+ }
 for (int64_t is = 0; is < is_max; ++is) {
 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
 }
@@ -7138,6 +7306,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+ }
+
+ static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+ }
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
@@ -7147,6 +7323,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ if (!g_cublas_loaded) return false;
+
 const int64_t ne10 = src1->ne[0];
 
 const int64_t ne0 = dst->ne[0];
@@ -7225,7 +7403,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 
 __global__ void k_compute_batched_ptrs(
 const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
- void ** ptrs,
+ const void ** ptrs_src, void ** ptrs_dst,
 int ne12, int ne13,
 int ne23,
 int nb02, int nb03,
@@ -7242,9 +7420,9 @@ __global__ void k_compute_batched_ptrs(
 int i03 = i13 / r3;
 int i02 = i12 / r2;
 
- ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
- ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
- ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7350,14 +7528,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 // use cublasGemmBatchedEx
 const int ne23 = ne12*ne13;
 
- void ** ptrs_as = nullptr;
- size_t ptrs_s = 0;
- ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+ const void ** ptrs_src = nullptr;
+ void ** ptrs_dst = nullptr;
+
+ size_t ptrs_src_s = 0;
+ size_t ptrs_dst_s = 0;
+
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
 dim3 block_dims(ne13, ne12);
 k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
 src0_as_f16, src1_as_f16, dst_f16,
- ptrs_as,
+ ptrs_src, ptrs_dst,
 ne12, ne13,
 ne23,
 nb02, nb03,
@@ -7369,14 +7552,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 CUBLAS_CHECK(
 cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
 ne01, ne11, ne10,
- &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
- (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
- &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+ &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+ &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
 ne23,
 CUBLAS_COMPUTE_16F,
 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
- ggml_cuda_pool_free(ptrs_as, ptrs_s);
+ if (ptrs_src_s != 0) {
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+ }
+ if (ptrs_dst_s != 0) {
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+ }
 }
 #endif
 
@@ -7389,10 +7577,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 const bool all_on_device =
- (src0->backend == GGML_BACKEND_GPU) &&
+ (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
 (src1->backend == GGML_BACKEND_GPU) &&
 ( dst->backend == GGML_BACKEND_GPU);
 
+ const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
 int64_t min_compute_capability = INT_MAX;
 for (int64_t id = 0; id < g_device_count; ++id) {
 if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@@ -7414,13 +7604,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
 //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
- if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
 // KQ single-batch
 ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
- } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+ } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
 // KQV single-batch
 ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+ } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
 // KQ + KQV multi-batch
 ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
 } else if (src0->type == GGML_TYPE_F32) {
@@ -7507,6 +7697,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
 ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
 ne10, ne11, nb10, nb11, nb12, main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+ ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+ ne10, ne11, nb10, nb11, nb12, main_stream);
 } else {
 fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7538,6 +7731,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+ static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+ }
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 (void) src0;
 (void) src1;
@@ -7649,11 +7846,11 @@ static size_t g_temp_tensor_extra_index = 0;
 
 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
 if (g_temp_tensor_extras == nullptr) {
- g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
 }
 
 size_t alloc_index = g_temp_tensor_extra_index;
- g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
 ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
 memset(extra, 0, sizeof(*extra));
 
@@ -7820,6 +8017,8 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+ if (!g_cublas_loaded) return false;
+
 ggml_cuda_func_t func;
 const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
 || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
@@ -7829,6 +8028,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 return false;
 }
 
+ if (tensor->op == GGML_OP_MUL_MAT) {
+ if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ #endif
+ return false;
+ }
+ }
+
 switch (tensor->op) {
 case GGML_OP_REPEAT:
 func = ggml_cuda_repeat;
@@ -7853,6 +8061,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 case GGML_UNARY_OP_SILU:
 func = ggml_cuda_silu;
 break;
+ case GGML_UNARY_OP_RELU:
+ func = ggml_cuda_relu;
+ break;
 default:
 return false;
 } break;
@@ -7871,6 +8082,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 case GGML_OP_SCALE:
 func = ggml_cuda_scale;
 break;
+ case GGML_OP_SQR:
+ func = ggml_cuda_sqr;
+ break;
 case GGML_OP_CLAMP:
 if (!any_on_device) {
 return false;
@@ -7901,6 +8115,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 case GGML_OP_ALIBI:
 func = ggml_cuda_alibi;
 break;
+ case GGML_OP_IM2COL:
+ func = ggml_cuda_im2col;
+ break;
 default:
 return false;
 }
@@ -7960,11 +8177,11 @@ struct ggml_backend_buffer_context_cuda {
 
 ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
 if (temp_tensor_extras == nullptr) {
- temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
 }
 
 size_t alloc_index = temp_tensor_extra_index;
- temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
 ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
 memset(extra, 0, sizeof(*extra));
 
@@ -8050,7 +8267,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
 ggml_cuda_set_device(g_main_device);
 
 ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+ size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+ ggml_cuda_set_device(g_main_device);
 CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
 return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
 
@@ -8117,6 +8339,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
 for (int i = 0; i < cgraph->n_nodes; i++) {
 ggml_tensor * node = cgraph->nodes[i];
 
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+ continue;
 assert(node->backend == GGML_BACKEND_GPU);
 for (int j = 0; j < GGML_MAX_SRC; j++) {
 if (node->src[j] != nullptr) {