llama_cpp 0.9.1 → 0.9.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +383 -210
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +277 -53
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +112 -30
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +877 -1707
- data/ext/llama_cpp/src/ggml.h +68 -45
- data/ext/llama_cpp/src/llama.cpp +475 -117
- data/ext/llama_cpp/src/llama.h +11 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml-cuda.cu:

@@ -81,12 +81,15 @@
 
 #include "ggml-cuda.h"
 #include "ggml.h"
+#include "ggml-backend-impl.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -433,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -553,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
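
A note on the two kernels just added: they follow the file's standard one-thread-per-element pattern (1-D grid, global index blockDim.x*blockIdx.x + threadIdx.x, early return past k). A minimal standalone harness that would exercise relu_f32 (hypothetical test code, not part of the gem) could look like:

    // hypothetical harness; assumes relu_f32 from the hunk above is pasted
    // into the same .cu file (it is declared static in ggml-cuda.cu)
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        const int k = 1000;
        float *x = nullptr, *dst = nullptr;
        cudaMallocManaged(&x,   k*sizeof(float));
        cudaMallocManaged(&dst, k*sizeof(float));
        for (int i = 0; i < k; ++i) x[i] = i - 500.0f;

        const int grid = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
        relu_f32<<<grid, CUDA_RELU_BLOCK_SIZE>>>(x, dst, k);
        cudaDeviceSynchronize();

        printf("dst[0] = %.1f, dst[999] = %.1f\n", dst[0], dst[999]); // 0.0, 499.0
        cudaFree(x); cudaFree(dst);
        return 0;
    }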
@@ -982,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1086,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1190,7 +1213,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
@@ -1444,7 +1467,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -4254,7 +4277,7 @@ template <bool need_check> static __global__ void
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
@@ -4294,7 +4317,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
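
The row-index changes in the last six hunks all make the same substitution: row blocks move from the grid's y dimension to its x dimension. The reason is spelled out in a comment added further down in this diff: CUDA caps gridDim.y and gridDim.z at 65535 blocks, while gridDim.x allows up to 2^31 - 1, so a sufficiently tall weight matrix could previously exceed the launchable grid. The limits can be confirmed with a standalone query (sketch, not from the file):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);
        // prints "x=2147483647 y=65535 z=65535" on current NVIDIA hardware
        printf("max grid: x=%d y=%d z=%d\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        return 0;
    }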
@@ -4468,6 +4491,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     *dsti = __float2half(*xi);
 }
 
+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
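
cpy_f32_f16 is, despite its name, a generic strided-copy kernel templated on a per-element converter, so the trivial cpy_1_f16_f16 above is all that is needed for F16 to F16 copies; ggml_cpy_f16_f16_cuda and a new branch in ggml_cuda_cpy (both later in this diff) wire it up. The pattern reduced to its essentials (illustrative sketch with simplified names; the real kernel also walks the ne/nb strides of non-contiguous tensors):

    typedef void (*cpy_kernel_t)(const char * cxi, char * cdsti);

    template <cpy_kernel_t cpy_1>
    static __global__ void copy_elems(const char * cx, char * cdst, const int ne,
                                      const int sz_src, const int sz_dst) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= ne) {
            return;
        }
        cpy_1(cx + i*sz_src, cdst + i*sz_dst); // converter is inlined per element
    }
    // copy_elems<cpy_1_f32_f16> : float -> half
    // copy_elems<cpy_1_f16_f16> : half  -> half (the new case)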
@@ -4721,6 +4751,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
+static __global__ void im2col_f32_f16(
+        const float * x, half * dst,
+        int ofs0, int ofs1, int IW, int IH, int CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+    const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+    const int offset_dst =
+        (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+        (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = __float2half(0.0f);
+    } else {
+        const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+    }
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
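
The kernel assigns one thread per (batch, kernel-position, output-position) tuple: the launcher further down uses grid (IC, OH, OW) and block (N, KH, KW). For intuition, the same gather written as CPU loops (a sketch using the launcher's variable names; ofs0/ofs1 are element offsets per batch and per channel):

    // CPU reference of the mapping im2col_f32_f16 performs; padded reads are 0.
    for (int n = 0; n < N; ++n)
    for (int oh = 0; oh < OH; ++oh)
    for (int ow = 0; ow < OW; ++ow)
    for (int ic = 0; ic < IC; ++ic)
    for (int kh = 0; kh < KH; ++kh)
    for (int kw = 0; kw < KW; ++kw) {
        const int iih = oh*s1 + kh*d1 - p1;
        const int iiw = ow*s0 + kw*d0 - p0;
        const float v = (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)
                      ? 0.0f
                      : x[n*ofs0 + ic*ofs1 + iih*IW + iiw];
        dst[((n*OH + oh)*OW + ow)*(IC*KH*KW) + (ic*KH + kh)*KW + kw] = __float2half(v);
    }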
@@ -4759,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
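
Both launchers use the usual ceiling division so the 1-D grid covers all k elements:

    // num_blocks = (k + BLOCK - 1) / BLOCK; e.g. k = 1000, BLOCK = 256:
    //   (1000 + 255) / 256 = 4 blocks = 1024 threads,
    //   and threads 1000..1023 retire through the kernel's `i >= k` guard.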
@@ -4867,7 +4926,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4876,7 +4936,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4885,7 +4945,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4894,7 +4954,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4903,7 +4963,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4913,7 +4973,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4922,7 +4982,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4931,7 +4991,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4946,7 +5006,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4954,7 +5014,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4963,7 +5023,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4972,7 +5032,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4981,7 +5041,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4990,7 +5050,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4999,7 +5059,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5008,7 +5068,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5017,7 +5077,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5026,7 +5086,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5035,7 +5095,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5054,7 +5114,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5610,6 +5670,16 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5693,6 +5763,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
+static void im2col_f32_f16_cuda(const float * x, half * dst,
+    int OH, int IW, int IH, int OW, int IC,
+    int KH, int KW, int N, int ofs0, int ofs1,
+    int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+    dim3 block_nums(IC, OH, OW);
+    dim3 block_dims(N, KH, KW);
+    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256
 
@@ -5761,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -5789,6 +5868,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -5802,7 +5886,12 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaDeviceSynchronize());
 #endif
 
-        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
@@ -5850,6 +5939,7 @@ void ggml_init_cublas() {
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
         initialized = true;
+        g_cublas_loaded = true;
     }
 }
 
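
Together, these three hunks let a cuBLAS-enabled build start on a machine with no usable CUDA device instead of aborting: ggml_init_cublas() now records the outcome and the new ggml_cublas_loaded() exposes it. It is also consulted by ggml_cuda_can_mul_mat and ggml_cuda_compute_forward below, which return false so work falls back to the CPU. A caller-side sketch (hypothetical usage, not from the diff):

    ggml_init_cublas();                 // no longer asserts when no device is found
    if (!ggml_cublas_loaded()) {
        // proceed on the CPU path; the CUDA entry points below will
        // decline work by returning false in this state
    }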
@@ -5888,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -6116,6 +6206,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6238,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
             return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6260,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
@@ -6451,8 +6571,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
         to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
     }
-    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
+    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
     size_t dst_as = 0;
     half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
@@ -6627,6 +6746,45 @@ inline void ggml_cuda_op_alibi(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_im2col(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t N  = src1->ne[is_2D ? 3 : 2];
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW = src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW = src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW = dst->ne[1];
+
+    const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+        OH, IW, IH, OW, IC, KH, KW, N,
+        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
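
dst->op_params packs (s0, s1, p0, p1, d0, d1, is_2D) as int32 values when the graph is built; OH and OW were already derived there from the standard dilated-convolution output-size formula, stated here for reference (standard formula, not quoted from the diff):

    // out = (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1
    static inline int64_t conv_out_size(int64_t in, int64_t k, int64_t s, int64_t p, int64_t d) {
        return (in + 2*p - d*(k - 1) - 1) / s + 1;
    }
    // OW = conv_out_size(IW, KW, s0, p0, d0);
    // OH = conv_out_size(IH, KH, s1, p1, d1);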
@@ -6892,6 +7050,8 @@ static void ggml_cuda_op_mul_mat(
     int64_t row_low[GGML_CUDA_MAX_DEVICES];
     int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+    int used_devices = 0;
+
     for (int64_t id = 0; id < g_device_count; ++id) {
         // by default, use all rows
         row_low[id] = 0;
@@ -6919,6 +7079,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }
 
+        used_devices++;
+
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
         const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
@@ -6957,12 +7119,12 @@
 
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && g_device_count > 1) {
+    if (split && used_devices > 1) {
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
-    const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -7078,6 +7240,9 @@
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
         CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
@@ -7102,6 +7267,9 @@
 
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     for (int64_t id = 0; id < g_device_count; ++id) {
+        if (row_low[id] == row_high[id]) {
+            continue;
+        }
         for (int64_t is = 0; is < is_max; ++is) {
             CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
         }
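
These hunks, together with the used_devices counter above, fix multi-GPU runs where a device is assigned an empty row range, e.g. with --tensor-split 1,0: such a device is no longer counted, waited on, or cleaned up. How an empty range arises, roughly (simplified illustration of the surrounding row-split logic):

    // g_tensor_split = {0.0f, 1.0f} for "--tensor-split 1,0" on two GPUs:
    //   device 0: row_low = 0,     row_high = nrows   -> does all the work
    //   device 1: row_low = nrows, row_high = nrows   -> row_low == row_high, skipped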
@@ -7138,6 +7306,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
@@ -7147,6 +7323,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
@@ -7225,7 +7403,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 
 __global__ void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
-        void ** ptrs,
+        const void ** ptrs_src, void ** ptrs_dst,
         int ne12, int ne13,
         int ne23,
         int nb02, int nb03,
@@ -7242,9 +7420,9 @@ __global__ void k_compute_batched_ptrs(
     int i03 = i13 / r3;
     int i02 = i12 / r2;
 
-    ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02   + i03*nb03;
-    ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs[2*ne23 + i12 + i13*ne12] = (char *)  dst_f16    + i12* nb2/2 + i13* nb3/2;
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16    + i12* nb2/2 + i13* nb3/2;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7350,14 +7528,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     // use cublasGemmBatchedEx
     const int ne23 = ne12*ne13;
 
-    void ** ptrs_as = nullptr;
-    size_t ptrs_s = 0;
-    ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+    const void ** ptrs_src = nullptr;
+          void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
     dim3 block_dims(ne13, ne12);
     k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
         src0_as_f16, src1_as_f16, dst_f16,
-        ptrs_as,
+        ptrs_src, ptrs_dst,
         ne12, ne13,
         ne23,
         nb02, nb03,
@@ -7369,14 +7552,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     CUBLAS_CHECK(
     cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
-            &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                        (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
-            &beta_f16,  (      void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
             ne23,
            CUBLAS_COMPUTE_16F,
            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-    ggml_cuda_pool_free(ptrs_as, ptrs_s);
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
 }
 #endif
 
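
cublasGemmBatchedEx consumes device-resident arrays of per-matrix pointers, one per batch entry (ne23 = ne12*ne13 entries per operand), which k_compute_batched_ptrs builds on the device. Splitting the former single pointer pool into a const source array and a mutable destination array keeps the casts const-correct, which appears to be aimed at the stricter HIP/ROCm toolchain. The resulting layout (summary of the code above):

    // ptrs_src[0*ne23 .. 1*ne23) : const pointers to the src0 (A) slices
    // ptrs_src[1*ne23 .. 2*ne23) : const pointers to the src1 (B) slices
    // ptrs_dst[0*ne23 .. 1*ne23) : mutable pointers to the dst (C) slices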
@@ -7389,10 +7577,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         (src1->backend == GGML_BACKEND_GPU) &&
         ( dst->backend == GGML_BACKEND_GPU);
 
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@@ -7414,13 +7604,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
@@ -7507,6 +7697,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7538,6 +7731,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -7649,11 +7846,11 @@ static size_t g_temp_tensor_extra_index = 0;
 
 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
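
GGML_CUDA_MAX_NODES (8192, defined near the top of this diff) replaces the graph-size constant this file previously borrowed from ggml.h; temp tensor extras are handed out round-robin from a fixed array. The pattern in isolation (sketch with a stand-in struct):

    #include <cstring>

    struct extra_t { void * slots[16]; };       // stand-in for ggml_tensor_extra_gpu

    static extra_t extras[8192];                // GGML_CUDA_MAX_NODES entries
    static size_t  next_extra = 0;

    static extra_t * alloc_extra(void) {
        extra_t * e = &extras[next_extra];
        next_extra = (next_extra + 1) % 8192;   // wraps: entries are recycled, so
        memset(e, 0, sizeof(*e));               // callers must not hold them long-term
        return e;
    }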
@@ -7820,6 +8017,8 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
@@ -7829,6 +8028,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
     switch (tensor->op) {
         case GGML_OP_REPEAT:
             func = ggml_cuda_repeat;
@@ -7853,6 +8061,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_UNARY_OP_SILU:
             func = ggml_cuda_silu;
             break;
+        case GGML_UNARY_OP_RELU:
+            func = ggml_cuda_relu;
+            break;
         default:
             return false;
     } break;
@@ -7871,6 +8082,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
@@ -7901,6 +8115,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ALIBI:
             func = ggml_cuda_alibi;
             break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
         default:
             return false;
     }
@@ -7960,11 +8177,11 @@ struct ggml_backend_buffer_context_cuda {
 
     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
         }
 
         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));
 
@@ -8050,7 +8267,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
     ggml_cuda_set_device(g_main_device);
 
     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
 
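
The clamp guards against cudaMalloc with size 0, which can succeed while returning a null pointer; the later null checks would then treat a legitimately empty buffer as an allocation failure. A quick standalone check (behavior may vary by driver):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        void * p = (void *) 0x1;
        cudaError_t err = cudaMalloc(&p, 0); // may be cudaSuccess with p == nullptr
        printf("err = %d, p = %p\n", (int) err, p);
        cudaFree(p);
        return 0;
    }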
@@ -8117,6 +8339,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {