llama_cpp 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +383 -210
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +277 -53
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +112 -30
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +877 -1707
- data/ext/llama_cpp/src/ggml.h +68 -45
- data/ext/llama_cpp/src/llama.cpp +475 -117
- data/ext/llama_cpp/src/llama.h +11 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -81,12 +81,15 @@
 
 #include "ggml-cuda.h"
 #include "ggml.h"
+#include "ggml-backend-impl.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:

@@ -433,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256

@@ -553,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
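The relu_f32 and sqr_f32 kernels added above are plain element-wise maps guarded by a bounds check; later in this diff they are launched with a ceil-division block count over CUDA_RELU_BLOCK_SIZE / CUDA_SQR_BLOCK_SIZE. The following is a minimal standalone sketch (illustrative only; it assumes nothing beyond the CUDA runtime, and none of the host-side names here come from the gem) that reproduces the same launch geometry and checks the ReLU result:

// Standalone sketch: same kernel shape and ceil-division launch as in this diff.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

static __global__ void relu_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0);
}

int main() {
    const int k     = 1000;                    // deliberately not a multiple of the block size
    const int block = 256;                     // mirrors CUDA_RELU_BLOCK_SIZE
    const int grid  = (k + block - 1) / block; // ceil division, as in relu_f32_cuda below

    std::vector<float> h(k);
    for (int i = 0; i < k; ++i) h[i] = (i % 2 == 0) ? -1.0f * i : 1.0f * i;

    float *dx = nullptr, *dy = nullptr;
    cudaMalloc(&dx, k*sizeof(float));
    cudaMalloc(&dy, k*sizeof(float));
    cudaMemcpy(dx, h.data(), k*sizeof(float), cudaMemcpyHostToDevice);

    relu_f32<<<grid, block>>>(dx, dy, k);
    cudaMemcpy(h.data(), dy, k*sizeof(float), cudaMemcpyDeviceToHost);

    printf("relu(x[1]) = %f (expected 1.0), relu(x[2]) = %f (expected 0.0)\n", h[1], h[2]);
    cudaFree(dx); cudaFree(dy);
    return 0;
}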
@@ -982,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;

@@ -1086,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;

@@ -1190,7 +1213,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;

@@ -1444,7 +1467,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;

@@ -4254,7 +4277,7 @@ template <bool need_check> static __global__ void
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;

@@ -4294,7 +4317,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;

@@ -4468,6 +4491,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     *dsti = __float2half(*xi);
 }
 
+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4751,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
+static __global__ void im2col_f32_f16(
+        const float * x, half * dst,
+        int ofs0, int ofs1, int IW, int IH, int CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+    const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+    const int offset_dst =
+        (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+        (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = __float2half(0.0f);
+    } else {
+        const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+    }
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
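The im2col_f32_f16 kernel maps grid dimensions (IC, OH, OW) and block dimensions (N, KH, KW) onto one output element per thread, writing a [N, OH, OW, IC*KH*KW] layout with zero padding, where ofs0/ofs1 are the batch and channel strides of the source in floats. As a reading aid, here is a CPU reference sketch (not part of the diff; names are illustrative) that performs the same index arithmetic in plain loops:

// CPU reference for the index mapping used by im2col_f32_f16 above.
#include <cstdint>

static void im2col_cpu_ref(const float * x, float * dst,
                           int64_t N, int64_t IC, int64_t IH, int64_t IW,
                           int64_t KH, int64_t KW, int64_t OH, int64_t OW,
                           int64_t ofs0, int64_t ofs1,
                           int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t CHW = IC*KH*KW; // one "column" of the im2col matrix
    for (int64_t n = 0; n < N; ++n)
    for (int64_t oh = 0; oh < OH; ++oh)
    for (int64_t ow = 0; ow < OW; ++ow)
    for (int64_t ic = 0; ic < IC; ++ic)
    for (int64_t kh = 0; kh < KH; ++kh)
    for (int64_t kw = 0; kw < KW; ++kw) {
        const int64_t iiw = ow*s0 + kw*d0 - p0; // input column sampled by this patch element
        const int64_t iih = oh*s1 + kh*d1 - p1; // input row sampled by this patch element
        const int64_t dst_idx = (n*OH*OW + oh*OW + ow)*CHW + (ic*KH*KW + kh*KW + kw);
        if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
            dst[dst_idx] = 0.0f;                // zero padding outside the input
        } else {
            dst[dst_idx] = x[n*ofs0 + ic*ofs1 + iih*IW + iiw];
        }
    }
}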
@@ -4759,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
@@ -4867,7 +4926,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
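The new comment explains the block_nums change that repeats through the launchers below: row counts can exceed 65535, which is the hardware limit for the y and z grid dimensions, while the x dimension allows up to 2^31-1 blocks. A quick way to confirm the limits on a given device (illustrative snippet, not from the diff):

// Why the row count moved to the x dimension of the grid: print the per-dimension grid limits.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        return 1;
    }
    // Typically prints {2147483647, 65535, 65535} on current GPUs.
    printf("maxGridSize = {%d, %d, %d}\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return 0;
}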
@@ -4876,7 +4936,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);

@@ -4885,7 +4945,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);

@@ -4894,7 +4954,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);

@@ -4903,7 +4963,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);

@@ -4913,7 +4973,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

@@ -4922,7 +4982,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

@@ -4931,7 +4991,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

@@ -4946,7 +5006,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

@@ -4954,7 +5014,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -4963,7 +5023,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -4972,7 +5032,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -4981,7 +5041,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -4990,7 +5050,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -4999,7 +5059,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -5008,7 +5068,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -5017,7 +5077,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -5026,7 +5086,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -5035,7 +5095,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);

@@ -5054,7 +5114,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5610,6 +5670,16 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);

@@ -5693,6 +5763,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
+static void im2col_f32_f16_cuda(const float * x, half * dst,
+    int OH, int IW, int IH, int OW, int IC,
+    int KH, int KW, int N, int ofs0, int ofs1,
+    int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+    dim3 block_nums(IC, OH, OW);
+    dim3 block_dims(N, KH, KW);
+    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256
 
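The launcher above uses block_dims(N, KH, KW), i.e. one thread block per (input channel, output row, output column) and one thread per (batch element, kernel row, kernel column). This implicitly assumes the block fits CUDA's launch limits (at most 1024 threads per block, blockDim.z at most 64); a hypothetical helper (not in the diff) making that assumption explicit:

// Hypothetical check mirroring the launch configuration of im2col_f32_f16_cuda above.
static bool im2col_block_fits(int N, int KH, int KW) {
    // CUDA limits: <= 1024 threads per block, blockDim.y <= 1024, blockDim.z <= 64.
    return (long long) N * KH * KW <= 1024 && KH <= 1024 && KW <= 64;
}
// e.g. a 3x3 kernel allows batch sizes up to 1024 / 9 = 113 with this launch scheme.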
@@ -5761,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;

@@ -5789,6 +5868,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
 
 void ggml_init_cublas() {
     static bool initialized = false;

@@ -5802,7 +5886,12 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaDeviceSynchronize());
 #endif
 
-
+        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
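With the cudaGetDeviceCount guard, ggml_init_cublas no longer aborts when no CUDA device is available; it records the failure, and the new ggml_cublas_loaded() accessor (exported through ggml-cuda.h in this release) reports it. A caller-side sketch of the intended fallback (illustrative; the wrapper function below is hypothetical, only ggml_init_cublas and ggml_cublas_loaded come from the library):

// Hypothetical caller: let a CUDA-enabled build degrade to the CPU path instead of failing.
#include <cstdio>
#include "ggml-cuda.h"

void init_backend() {
    ggml_init_cublas();            // now records success/failure instead of asserting
    if (!ggml_cublas_loaded()) {
        fprintf(stderr, "no usable CUDA device, falling back to CPU\n");
        return;                    // skip GPU offload setup
    }
    // ... proceed with GPU offload setup ...
}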
@@ -5850,6 +5939,7 @@ void ggml_init_cublas() {
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
         initialized = true;
+        g_cublas_loaded = true;
     }
 }
 

@@ -5888,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }

@@ -6116,6 +6206,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6238,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
             return max_compute_capability >= CC_RDNA2 ? 128 : 32;

@@ -6260,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
            return 1;
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:

@@ -6451,8 +6571,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
             src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
             to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
         }
-        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *)
-
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
         size_t dst_as = 0;
         half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 

@@ -6627,6 +6746,45 @@ inline void ggml_cuda_op_alibi(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_im2col(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t N = src1->ne[is_2D ? 3 : 2];
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW = src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW = src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW = dst->ne[1];
+
+    const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+        OH, IW, IH, OW, IC, KH, KW, N,
+        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6892,6 +7050,8 @@ static void ggml_cuda_op_mul_mat(
     int64_t row_low[GGML_CUDA_MAX_DEVICES];
     int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+    int used_devices = 0;
+
     for (int64_t id = 0; id < g_device_count; ++id) {
         // by default, use all rows
         row_low[id] = 0;

@@ -6919,6 +7079,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }
 
+        used_devices++;
+
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
         const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 

@@ -6957,12 +7119,12 @@ static void ggml_cuda_op_mul_mat(
 
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split &&
+    if (split && used_devices > 1) {
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
-    const int64_t src1_col_stride = split &&
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

@@ -7078,6 +7240,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
         CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done

@@ -7102,6 +7267,9 @@ static void ggml_cuda_op_mul_mat(
 
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
             for (int64_t is = 0; is < is_max; ++is) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }

@@ -7138,6 +7306,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }

@@ -7147,6 +7323,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];

@@ -7225,7 +7403,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 
 __global__ void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
-        void **
+        const void ** ptrs_src, void ** ptrs_dst,
         int ne12, int ne13,
         int ne23,
         int nb02, int nb03,

@@ -7242,9 +7420,9 @@ __global__ void k_compute_batched_ptrs(
     int i03 = i13 / r3;
     int i02 = i12 / r2;
 
-
-
-
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
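k_compute_batched_ptrs fills a single pool allocation with ne23 A-pointers followed by ne23 B-pointers (ptrs_src) plus ne23 C-pointers (ptrs_dst); that is the layout cublasGemmBatchedEx consumes in the next hunk. A host-side sketch of the same index math (illustrative only; the real table is built on the GPU by the kernel above, and the struct and function names here are hypothetical):

// Host-side mirror of the pointer-table construction for the batched GEMM.
#include <vector>
#include <cstddef>

struct BatchedPtrs {
    std::vector<const void *> src; // 2*ne23 entries: A pointers, then B pointers
    std::vector<void *>       dst; // ne23 entries: C pointers
};

static BatchedPtrs compute_batched_ptrs_host(
        const char * src0_f16, const char * src1_f16, char * dst_f16,
        int ne12, int ne13, int r2, int r3,
        size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nb2, size_t nb3) {
    const int ne23 = ne12*ne13;
    BatchedPtrs p;
    p.src.resize(2*ne23);
    p.dst.resize(ne23);
    for (int i13 = 0; i13 < ne13; ++i13) {
        for (int i12 = 0; i12 < ne12; ++i12) {
            const int i03 = i13 / r3; // broadcast src0 over the outer batch dims
            const int i02 = i12 / r2;
            p.src[0*ne23 + i12 + i13*ne12] = src0_f16 + i02*nb02 + i03*nb03;
            p.src[1*ne23 + i12 + i13*ne12] = src1_f16 + i12*nb12/2 + i13*nb13/2; // f32 strides halved: data was converted to f16
            p.dst[0*ne23 + i12 + i13*ne12] = dst_f16  + i12*nb2/2  + i13*nb3/2;
        }
    }
    return p;
}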
@@ -7350,14 +7528,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     // use cublasGemmBatchedEx
     const int ne23 = ne12*ne13;
 
-    void **
-
-
+    const void ** ptrs_src = nullptr;
+    void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
     dim3 block_dims(ne13, ne12);
     k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
         src0_as_f16, src1_as_f16, dst_f16,
-
+        ptrs_src, ptrs_dst,
         ne12, ne13,
         ne23,
         nb02, nb03,

@@ -7369,14 +7552,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     CUBLAS_CHECK(
     cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
-            &alpha_f16, (const void
-                        (const void
-            &beta_f16,  ( void **
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+            &beta_f16,  ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
             ne23,
             CUBLAS_COMPUTE_16F,
             CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
 }
 #endif
 

@@ -7389,10 +7577,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         (src1->backend == GGML_BACKEND_GPU) &&
         ( dst->backend == GGML_BACKEND_GPU);
 
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {

@@ -7414,13 +7604,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {

@@ -7507,6 +7697,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));

@@ -7538,6 +7731,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;

@@ -7649,11 +7846,11 @@ static size_t g_temp_tensor_extra_index = 0;
 
 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) %
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
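Both tensor-extra allocators now size their scratch arrays to GGML_CUDA_MAX_NODES (8192) entries and hand out slots round-robin, so graph evaluation reuses a fixed pool instead of growing it. The pattern in isolation (generic sketch, names illustrative; not part of the diff):

// Fixed-capacity ring pool: slots are recycled with the same wrap-around as above.
#include <array>
#include <cstring>

template <typename T, size_t N>
struct RingPool {
    std::array<T, N> slots{};
    size_t next = 0;

    T * acquire() {
        T * item = &slots[next];
        next = (next + 1) % N;          // wrap-around index, like GGML_CUDA_MAX_NODES above
        std::memset(item, 0, sizeof(*item)); // reset the recycled slot (T assumed trivially copyable)
        return item;
    }
};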
@@ -7820,6 +8017,8 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))

@@ -7829,6 +8028,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
     switch (tensor->op) {
         case GGML_OP_REPEAT:
             func = ggml_cuda_repeat;

@@ -7853,6 +8061,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                 case GGML_UNARY_OP_SILU:
                     func = ggml_cuda_silu;
                     break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
                 default:
                     return false;
             } break;

@@ -7871,6 +8082,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;

@@ -7901,6 +8115,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ALIBI:
             func = ggml_cuda_alibi;
             break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
         default:
             return false;
     }

@@ -7960,11 +8177,11 @@ struct ggml_backend_buffer_context_cuda {
 
     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
         }
 
         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) %
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));
 

@@ -8050,7 +8267,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
     ggml_cuda_set_device(g_main_device);
 
     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
 

@@ -8117,6 +8339,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
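The graph-compute loop now skips reshape, transpose, view and permute nodes: these ops only rewrite tensor metadata (shape and strides), so there is no kernel to launch and no backend assertion to enforce for them. An equivalent predicate capturing the same set (illustrative; it assumes only the GGML_OP_* enumerators from ggml.h):

// Same set of no-op "view" operations as the condition added above.
#include <stdbool.h>
#include "ggml.h"

static bool ggml_cuda_is_view_op(enum ggml_op op) {
    return op == GGML_OP_RESHAPE   ||
           op == GGML_OP_TRANSPOSE ||
           op == GGML_OP_VIEW      ||
           op == GGML_OP_PERMUTE;
}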
|