llama_cpp 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +354 -126
- data/ext/llama_cpp/src/ggml-metal.metal +128 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +17 -15
- data/ext/llama_cpp/src/ggml.c +58 -46
- data/ext/llama_cpp/src/ggml.h +12 -7
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1360 -60
- data/lib/llama_cpp/version.rb +2 -2
- metadata +4 -2
@@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -1574,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int r = y[row];
+
+    // copy x[r*ncols + col] to dst[row*ncols + col]
+    const int xi = r*ncols + col;
+    const int di = row*ncols + col;
+
+    const int ib = xi/qk; // block index
+    const int iqs = (xi%qk)/qr; // quant index
+    const int iybs = di - di%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(x, ib, iqs, v);
+
+    dst[iybs + iqs + 0]        = v.x;
+    dst[iybs + iqs + y_offset] = v.y;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -4555,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }

+static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, nrows, 1);
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+}
+
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5436,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }

+static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+}
+
 template<typename T>
 static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
                       const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5703,7 +5757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
-
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];
@@ -5739,6 +5793,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     }
 }

+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+    for (int i3 = 0; i3 < nr3; i3++) {
+        for (int k3 = 0; k3 < ne03; k3++) {
+            for (int i2 = 0; i2 < nr2; i2++) {
+                for (int k2 = 0; k2 < ne02; k2++) {
+                    for (int i1 = 0; i1 < nr1; i1++) {
+                        for (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                CUDA_CHECK(cudaMemcpyAsync(
+                                    (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
+                                    (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+                                    ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    (void) src1;
+    (void) src1_d;
+}
+
+static void ggml_cuda_op_get_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int ncols = src0->ne[0];
+    const int nrows = ggml_nelements(src1);
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
 inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6279,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
     const int64_t ne02 = src0->ne[2];
     const int64_t nrows = ggml_nrows(src0);

-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

-    GGML_ASSERT(ne01 + n_past == ne00);
+    //GGML_ASSERT(ne01 + n_past == ne00);
     GGML_ASSERT(n_head == ne02);

     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6343,7 +6498,14 @@ inline void ggml_cuda_op_scale(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-
+    float scale;
+    // HACK: support for ggml backend interface
+    if (src1->backend == GGML_BACKEND_CPU) {
+        scale = ((float *) src1->data)[0];
+    } else {
+        // TODO: pass pointer to kernel instead of copying to host
+        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
+    }

     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -6353,6 +6515,24 @@ inline void ggml_cuda_op_scale(
     (void) src1_dd;
 }

+inline void ggml_cuda_op_clamp(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const float min = ((float *) dst->op_params)[0];
+    const float max = ((float *) dst->op_params)[1];
+
+    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
     const int64_t nrows0 = ggml_nrows(src0);

@@ -6362,9 +6542,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);

-
-
-
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6505,9 +6685,9 @@ static void ggml_cuda_op_mul_mat(
     const size_t q8_1_ts = sizeof(block_q8_1);
     const size_t q8_1_bs = QK8_1;

-
-
-
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6585,7 +6765,7 @@ static void ggml_cuda_op_mul_mat(
         if (convert_src1_to_q8_1) {
             src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);

-            if (
+            if (src1_on_device && src1_is_contiguous) {
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6667,7 +6847,7 @@ static void ggml_cuda_op_mul_mat(
                 GGML_ASSERT(false);
             }

-            if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
+            if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
                 quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6758,6 +6938,14 @@ static void ggml_cuda_op_mul_mat(
     }
 }

+static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+}
+
+static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+}
+
 static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
@@ -6812,13 +7000,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

-
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];

-
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

-
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -6843,13 +7031,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

-
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];

-
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

-
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

     const int64_t row_stride_x = nb01 / sizeof(half);
@@ -6870,11 +7058,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }

-    if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    }else if (src0->type == GGML_TYPE_F32) {
+    } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -6906,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }

+static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+}
+
 static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
@@ -6935,8 +7127,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

-    const
-    const
+    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;

     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -6991,8 +7183,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {

     const size_t nb1 = tensor->nb[1];

-
-
+    ggml_backend_type backend = tensor->backend;
+    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));

     for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7046,7 +7238,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
             CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
         }

-
         CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

         extra->data_device[id] = buf;
@@ -7085,17 +7276,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }

-static
+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
 static size_t g_temp_tensor_extra_index = 0;

-static
+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
         g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
     }

     size_t alloc_index = g_temp_tensor_extra_index;
     g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
-
+    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));

     return extra;
@@ -7123,7 +7314,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         return;
     }

-
+    ggml_tensor_extra_gpu * extra;

     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -7132,7 +7323,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra

     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -7141,7 +7332,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-
+        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
@@ -7183,13 +7374,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
         CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
     }

-
+    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();

     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW;

     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -7207,7 +7398,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
     GGML_ASSERT(ggml_is_contiguous(tensor));

-
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
 }
@@ -7264,58 +7455,47 @@ void ggml_cuda_free_scratch() {
     g_scratch_buffer = nullptr;
 }

-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+        return false;
+    }
+
     switch (tensor->op) {
+        case GGML_OP_REPEAT:
+            func = ggml_cuda_repeat;
+            break;
+        case GGML_OP_GET_ROWS:
+            func = ggml_cuda_get_rows;
+            break;
         case GGML_OP_DUP:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_dup;
             break;
         case GGML_OP_ADD:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_add;
             break;
         case GGML_OP_MUL:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_mul;
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
-                    if (!any_on_device) {
-                        return false;
-                    }
                     func = ggml_cuda_gelu;
                     break;
                 case GGML_UNARY_OP_SILU:
-                    if (!any_on_device) {
-                        return false;
-                    }
                     func = ggml_cuda_silu;
                     break;
                 default:
                     return false;
             } break;
         case GGML_OP_NORM:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_norm;
             break;
         case GGML_OP_RMS_NORM:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
@@ -7325,54 +7505,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_mul_mat;
             break;
         case GGML_OP_SCALE:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_scale;
             break;
-        case
+        case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
             }
+            func = ggml_cuda_clamp;
+            break;
+        case GGML_OP_CPY:
             func = ggml_cuda_cpy;
             break;
         case GGML_OP_CONT:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_dup;
             break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_nop;
             break;
         case GGML_OP_DIAG_MASK_INF:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_diag_mask_inf;
             break;
         case GGML_OP_SOFT_MAX:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_soft_max;
             break;
         case GGML_OP_ROPE:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_rope;
             break;
         case GGML_OP_ALIBI:
-            if (!any_on_device) {
-                return false;
-            }
             func = ggml_cuda_alibi;
             break;
         default:
@@ -7400,3 +7562,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_backend_context_cuda {
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+    delete cuda_ctx;
+    delete backend;
+}
+
+struct ggml_backend_buffer_context_cuda {
+    void * device;
+
+    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+    size_t temp_tensor_extra_index = 0;
+
+    ~ggml_backend_buffer_context_cuda() {
+        delete[] temp_tensor_extras;
+    }
+
+    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+        if (temp_tensor_extras == nullptr) {
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        }
+
+        size_t alloc_index = temp_tensor_extra_index;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+        memset(extra, 0, sizeof(*extra));
+
+        return extra;
+    }
+};
+
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFree(ctx->device));
+    delete ctx;
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->device;
+}
+
+static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+    }
+
+    return size;
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->backend == buffer->backend);
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+    extra->data_device[g_main_device] = tensor->data;
+
+    tensor->backend = GGML_BACKEND_GPU;
+    tensor->extra = extra;
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+        }
+    }
+
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .free_buffer    = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_cuda_buffer_get_base,
+    /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
+    /* .init_tensor    = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .free_tensor    = */ NULL,
+};
+
+static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+    CUDA_CHECK(cudaMalloc(&ctx->device, size));
+    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+    return 128;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name            = */ ggml_backend_cuda_name,
+    /* .free                = */ ggml_backend_cuda_free,
+    /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_cuda_synchronize,
+    /* .cpy_tensor_from     = */ nullptr,
+    /* .cpy_tensor_to       = */ nullptr,
+    /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op         = */ nullptr,
+};
+
+ggml_backend_t ggml_backend_cuda_init() {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context   = */ ctx
+    };
+
+    return cuda_backend;
+}