llama_cpp 0.12.1 → 0.12.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
@@ -8,8 +8,13 @@
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
+#include <string>
 #include <vector>
-
+#include <map>
+#include <array>
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"

 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -77,6 +82,7 @@
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
 #define cudaMemsetAsync hipMemsetAsync
+#define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -112,9 +118,7 @@

 #endif // defined(GGML_USE_HIPBLAS)

-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
+#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

 #define CC_PASCAL 600
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
@@ -519,6 +523,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256

+#define CUDA_Q8_0_NE_ALIGN 2048
+
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
@@ -562,7 +568,7 @@ static void ggml_cuda_set_device(const int device) {

 static int g_device_count = -1;
 static int g_main_device = 0;
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};

 struct cuda_device_capabilities {
     int cc; // compute capability
@@ -573,10 +579,6 @@ struct cuda_device_capabilities {

 static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };

-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 0; // disabled by default
-static size_t g_scratch_offset = 0;
-
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

 [[noreturn]]
@@ -605,16 +607,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 }

 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if
-    (void) a;
-    bad_arch();
-#else
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
     }
     return a;
-#
+#else
+    (void) a;
+    bad_arch();
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }

 static __device__ __forceinline__ float warp_reduce_max(float x) {
@@ -626,16 +628,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 }

 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if
-    (void) x;
-    bad_arch();
-#else
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
     }
     return x;
-#
+#else
+    (void) x;
+    bad_arch();
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
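The half2 reductions above use the same XOR-shuffle butterfly as the existing float variants in this file: each step folds lanes that are 16, 8, 4, 2 and finally 1 apart, so after five steps every lane of the warp holds the full result. A minimal, self-contained CUDA sketch of that pattern (the names and the demo harness are illustrative, not part of ggml):

```cuda
// Standalone illustration of the warp-level butterfly reduction used by
// warp_reduce_sum / warp_reduce_max above. Assumes a full 32-thread warp.
#include <cstdio>
#include <cuda_runtime.h>

__device__ __forceinline__ float warp_reduce_sum_f32(float x) {
    // XOR shuffle halves the lane distance each step: 16, 8, 4, 2, 1.
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

__global__ void reduce_demo(const float * in, float * out) {
    float v = in[threadIdx.x];
    v = warp_reduce_sum_f32(v);
    if (threadIdx.x == 0) {
        *out = v;
    }
}

int main() {
    float h_in[32];
    for (int i = 0; i < 32; ++i) h_in[i] = 1.0f;   // expected sum: 32
    float *d_in, *d_out, h_out = 0.0f;
    cudaMalloc(&d_in, 32*sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in, 32*sizeof(float), cudaMemcpyHostToDevice);
    reduce_demo<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sum = %.1f\n", h_out);
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}
```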
@@ -1103,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
|
|
1103
1105
|
#endif // GGML_CUDA_F16
|
1104
1106
|
}
|
1105
1107
|
|
1108
|
+
template<typename dst_t>
|
1109
|
+
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
1110
|
+
|
1111
|
+
const int i = blockIdx.x;
|
1112
|
+
|
1113
|
+
// assume 32 threads
|
1114
|
+
const int tid = threadIdx.x;
|
1115
|
+
const int il = tid/8;
|
1116
|
+
const int ir = tid%8;
|
1117
|
+
const int ib = 8*i + ir;
|
1118
|
+
if (ib >= nb32) {
|
1119
|
+
return;
|
1120
|
+
}
|
1121
|
+
|
1122
|
+
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
1123
|
+
|
1124
|
+
const block_q4_0 * x = (const block_q4_0 *)vx + ib;
|
1125
|
+
const float d = __half2float(x->d);
|
1126
|
+
const float dm = -8*d;
|
1127
|
+
|
1128
|
+
const uint8_t * q = x->qs + 4*il;
|
1129
|
+
|
1130
|
+
for (int l = 0; l < 4; ++l) {
|
1131
|
+
y[l+ 0] = d * (q[l] & 0xF) + dm;
|
1132
|
+
y[l+16] = d * (q[l] >> 4) + dm;
|
1133
|
+
}
|
1134
|
+
}
|
1135
|
+
|
1136
|
+
template<typename dst_t>
|
1137
|
+
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
|
1138
|
+
|
1139
|
+
const int i = blockIdx.x;
|
1140
|
+
|
1141
|
+
// assume 32 threads
|
1142
|
+
const int tid = threadIdx.x;
|
1143
|
+
const int il = tid/8;
|
1144
|
+
const int ir = tid%8;
|
1145
|
+
const int ib = 8*i + ir;
|
1146
|
+
if (ib >= nb32) {
|
1147
|
+
return;
|
1148
|
+
}
|
1149
|
+
|
1150
|
+
dst_t * y = yy + 256*i + 32*ir + 4*il;
|
1151
|
+
|
1152
|
+
const block_q4_1 * x = (const block_q4_1 *)vx + ib;
|
1153
|
+
const float2 d = __half22float2(x->dm);
|
1154
|
+
|
1155
|
+
const uint8_t * q = x->qs + 4*il;
|
1156
|
+
|
1157
|
+
for (int l = 0; l < 4; ++l) {
|
1158
|
+
y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
|
1159
|
+
y[l+16] = d.x * (q[l] >> 4) + d.y;
|
1160
|
+
}
|
1161
|
+
}
|
1162
|
+
|
1106
1163
|
//================================== k-quants
|
1107
1164
|
|
1108
1165
|
template<typename dst_t>
|
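The new q4_0 kernel above computes y = d * (q & 0xF) + dm with dm = -8*d, i.e. it recenters the 4-bit values from [0, 15] to [-8, 7] and scales them; low nibbles fill the first half of the 32-value block, high nibbles the second. A plain C++ reference of the same per-block math (the struct below is a simplified stand-in with a float scale; ggml's real block_q4_0 stores the scale as fp16):

```cpp
// Host-side reference for the Q4_0 math in dequantize_block_q4_0 above.
#include <cstdint>
#include <cstdio>

struct q4_0_block_f32 {       // simplified: the real block stores d as ggml_fp16_t
    float   d;                // scale
    uint8_t qs[16];           // 32 x 4-bit quants, two per byte
};

static void dequantize_q4_0_ref(const q4_0_block_f32 & b, float * y /* 32 floats */) {
    const float d  = b.d;
    const float dm = -8.0f * d;                 // same offset the kernel folds into dm
    for (int l = 0; l < 16; ++l) {
        y[l]      = d * (b.qs[l] & 0xF) + dm;   // low nibble  -> first half of the block
        y[l + 16] = d * (b.qs[l] >>  4) + dm;   // high nibble -> second half of the block
    }
}

int main() {
    q4_0_block_f32 b = { 0.5f, {} };
    for (int l = 0; l < 16; ++l) b.qs[l] = 0x8F; // low nibble 15, high nibble 8
    float y[32];
    dequantize_q4_0_ref(b, y);
    printf("y[0] = %.2f, y[16] = %.2f\n", y[0], y[16]);  // 3.50, 0.00
    return 0;
}
```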
@@ -2327,6 +2384,45 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
|
|
2327
2384
|
y[i] = x[i];
|
2328
2385
|
}
|
2329
2386
|
|
2387
|
+
template <bool need_check>
|
2388
|
+
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
|
2389
|
+
#if __CUDA_ARCH__ >= CC_PASCAL
|
2390
|
+
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
2391
|
+
|
2392
|
+
const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
|
2393
|
+
const int * x0 = ((int *) vx) + blockIdx.x * nint;
|
2394
|
+
half2 * y2 = (half2 *) (y + i0);
|
2395
|
+
|
2396
|
+
__shared__ int vals[nint];
|
2397
|
+
|
2398
|
+
#pragma unroll
|
2399
|
+
for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
|
2400
|
+
if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
|
2401
|
+
break;
|
2402
|
+
}
|
2403
|
+
|
2404
|
+
const int ix = ix0 + threadIdx.x;
|
2405
|
+
vals[ix] = x0[ix];
|
2406
|
+
}
|
2407
|
+
|
2408
|
+
#pragma unroll
|
2409
|
+
for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
|
2410
|
+
if (need_check && i0 + iy + 2*threadIdx.x >= k) {
|
2411
|
+
return;
|
2412
|
+
}
|
2413
|
+
|
2414
|
+
const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
|
2415
|
+
const half d = *b0;
|
2416
|
+
const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
|
2417
|
+
|
2418
|
+
y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
|
2419
|
+
}
|
2420
|
+
#else
|
2421
|
+
(void) vx; (void) y; (void) k;
|
2422
|
+
bad_arch();
|
2423
|
+
#endif // __CUDA_ARCH__ >= CC_PASCAL
|
2424
|
+
}
|
2425
|
+
|
2330
2426
|
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
2331
2427
|
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
2332
2428
|
|
@@ -5613,7 +5709,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

 template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
 static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
     const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
     const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;

@@ -5738,7 +5834,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
 #else
     (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
     bad_arch();
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

 template <bool vals_smem, int ncols_template, int block_size_template>
@@ -6181,6 +6277,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri
     dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
+    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
+        const bool need_check = false;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    } else {
+        const bool need_check = true;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    }
+}
+
 template<typename dst_t>
 static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
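dequantize_block_q8_0_f16_cuda above specializes the kernel on a compile-time need_check flag so the fully aligned case pays no bounds checks at all, and only the last, partially filled block takes the checked path. A toy, self-contained version of that launch pattern (all names here are illustrative, not from ggml):

```cuda
// Compile-time need_check specialization, mirroring the launcher above.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int NE_ALIGN = 2048;   // elements handled per block, like CUDA_Q8_0_NE_ALIGN

template <bool need_check>
__global__ void scale_blockwise(const float * x, float * y, int k) {
    for (int j = NE_ALIGN*blockIdx.x + threadIdx.x; j < NE_ALIGN*(blockIdx.x + 1); j += blockDim.x) {
        if (need_check && j >= k) {
            return;              // only compiled into the unaligned specialization
        }
        y[j] = 2.0f*x[j];
    }
}

static void scale_cuda(const float * x, float * y, int k, cudaStream_t stream) {
    const int num_blocks = (k + NE_ALIGN - 1) / NE_ALIGN;
    if (k % NE_ALIGN == 0) {
        scale_blockwise<false><<<num_blocks, 256, 0, stream>>>(x, y, k);
    } else {
        scale_blockwise<true><<<num_blocks, 256, 0, stream>>>(x, y, k);
    }
}

int main() {
    const int k = 3000;          // not a multiple of NE_ALIGN -> bounds-checked path
    float *x, *y;
    cudaMallocManaged(&x, k*sizeof(float));
    cudaMallocManaged(&y, k*sizeof(float));
    for (int i = 0; i < k; ++i) x[i] = 1.0f;
    scale_cuda(x, y, k, 0);
    cudaDeviceSynchronize();
    printf("y[0] = %.1f, y[%d] = %.1f\n", y[0], k - 1, y[k - 1]);   // both 2.0
    cudaFree(x); cudaFree(y);
    return 0;
}
```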
@@ -6201,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }

+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
 template<typename dst_t>
 static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -6246,16 +6367,21 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
|
|
6246
6367
|
}
|
6247
6368
|
|
6248
6369
|
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
6370
|
+
int id;
|
6249
6371
|
switch (type) {
|
6250
6372
|
case GGML_TYPE_Q4_0:
|
6251
|
-
return
|
6373
|
+
return dequantize_row_q4_0_cuda;
|
6252
6374
|
case GGML_TYPE_Q4_1:
|
6253
|
-
return
|
6375
|
+
return dequantize_row_q4_1_cuda;
|
6254
6376
|
case GGML_TYPE_Q5_0:
|
6255
6377
|
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
6256
6378
|
case GGML_TYPE_Q5_1:
|
6257
6379
|
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
6258
6380
|
case GGML_TYPE_Q8_0:
|
6381
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
6382
|
+
if (g_device_caps[id].cc >= CC_PASCAL) {
|
6383
|
+
return dequantize_block_q8_0_f16_cuda;
|
6384
|
+
}
|
6259
6385
|
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
6260
6386
|
case GGML_TYPE_Q2_K:
|
6261
6387
|
return dequantize_row_q2_K_cuda;
|
@@ -6281,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
6281
6407
|
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
6282
6408
|
switch (type) {
|
6283
6409
|
case GGML_TYPE_Q4_0:
|
6284
|
-
return
|
6410
|
+
return dequantize_row_q4_0_cuda;
|
6285
6411
|
case GGML_TYPE_Q4_1:
|
6286
|
-
return
|
6412
|
+
return dequantize_row_q4_1_cuda;
|
6287
6413
|
case GGML_TYPE_Q5_0:
|
6288
6414
|
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
6289
6415
|
case GGML_TYPE_Q5_1:
|
@@ -7489,11 +7615,11 @@ struct cuda_pool_alloc {

 static bool g_cublas_loaded = false;

-bool ggml_cublas_loaded(void) {
+GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }

-void ggml_init_cublas() {
+GGML_CALL void ggml_init_cublas() {
     static bool initialized = false;

     if (!initialized) {
@@ -7546,8 +7672,9 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
         fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

-        g_tensor_split[id] = total_vram;
+        g_default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
@@ -7556,7 +7683,7 @@ void ggml_init_cublas() {
         g_device_caps[id].smpb = prop.sharedMemPerBlock;
     }
     for (int id = 0; id < g_device_count; ++id) {
-        g_tensor_split[id] /= total_vram;
+        g_default_tensor_split[id] /= total_vram;
     }

     for (int id = 0; id < g_device_count; ++id) {
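With the two hunks above, the default split replaces the old g_tensor_split global: g_default_tensor_split[id] ends up holding the cumulative fraction of total VRAM owned by devices 0..id-1, normalized to [0, 1). A standalone sketch of that bookkeeping with hypothetical VRAM sizes:

```cpp
// Prefix-sum-then-normalize, as done in ggml_init_cublas above.
#include <array>
#include <cstdio>

constexpr int MAX_DEVICES = 16;

int main() {
    // hypothetical VRAM sizes in GiB for a 3-GPU box
    const double vram[] = {24.0, 24.0, 48.0};
    const int device_count = 3;

    std::array<double, MAX_DEVICES> tensor_split{};
    double total_vram = 0.0;
    for (int id = 0; id < device_count; ++id) {
        tensor_split[id] = total_vram;   // prefix sum *before* adding this device
        total_vram += vram[id];
    }
    for (int id = 0; id < device_count; ++id) {
        tensor_split[id] /= total_vram;  // normalize to [0, 1)
    }

    // -> 0.00, 0.25, 0.50: device 2 (the 48 GiB card) is responsible for half of the rows
    for (int id = 0; id < device_count; ++id) {
        printf("device %d starts at fraction %.2f\n", id, tensor_split[id]);
    }
    return 0;
}
```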
@@ -7580,31 +7707,7 @@ void ggml_init_cublas() {
|
|
7580
7707
|
}
|
7581
7708
|
}
|
7582
7709
|
|
7583
|
-
void
|
7584
|
-
if (tensor_split == nullptr) {
|
7585
|
-
return;
|
7586
|
-
}
|
7587
|
-
bool all_zero = true;
|
7588
|
-
for (int i = 0; i < g_device_count; ++i) {
|
7589
|
-
if (tensor_split[i] != 0.0f) {
|
7590
|
-
all_zero = false;
|
7591
|
-
break;
|
7592
|
-
}
|
7593
|
-
}
|
7594
|
-
if (all_zero) {
|
7595
|
-
return;
|
7596
|
-
}
|
7597
|
-
float split_sum = 0.0f;
|
7598
|
-
for (int i = 0; i < g_device_count; ++i) {
|
7599
|
-
g_tensor_split[i] = split_sum;
|
7600
|
-
split_sum += tensor_split[i];
|
7601
|
-
}
|
7602
|
-
for (int i = 0; i < g_device_count; ++i) {
|
7603
|
-
g_tensor_split[i] /= split_sum;
|
7604
|
-
}
|
7605
|
-
}
|
7606
|
-
|
7607
|
-
void * ggml_cuda_host_malloc(size_t size) {
|
7710
|
+
GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
|
7608
7711
|
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
7609
7712
|
return nullptr;
|
7610
7713
|
}
|
@@ -7622,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
     return ptr;
 }

-void ggml_cuda_host_free(void * ptr) {
+GGML_CALL void ggml_cuda_host_free(void * ptr) {
     CUDA_CHECK(cudaFreeHost(ptr));
 }

@@ -8055,11 +8158,11 @@ static void ggml_cuda_op_mul_mat_q(
     (void) src1_ddf_i;
 }

-static int64_t get_row_rounding(ggml_type type) {
+static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
     int64_t min_compute_capability = INT_MAX;
     int64_t max_compute_capability = INT_MIN;
     for (int id = 0; id < g_device_count; ++id) {
-        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+        if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
             if (min_compute_capability > g_device_caps[id].cc) {
                 min_compute_capability = g_device_caps[id].cc;
             }
@@ -8120,6 +8223,21 @@ static int64_t get_row_rounding(ggml_type type) {
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }

+static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
+    const int64_t nrows = ggml_nrows(tensor);
+    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+
+    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
+    *row_low -= *row_low % rounding;
+
+    if (id == g_device_count - 1) {
+        *row_high = nrows;
+    } else {
+        *row_high = nrows*tensor_split[id + 1];
+        *row_high -= *row_high % rounding;
+    }
+}
+
 static void ggml_cuda_op_mul_mat_vec_q(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -8574,15 +8692,15 @@ static void ggml_cuda_op_soft_max(
     float scale = 1.0f;
     memcpy(&scale, dst->op_params, sizeof(float));

-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    const bool use_f16_soft_max = false;
-#else
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
 #ifdef GGML_CUDA_F16
     const bool use_f16_soft_max = true;
 #else
     const bool use_f16_soft_max = false;
 #endif // GGML_CUDA_F16
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#else
+    const bool use_f16_soft_max = false;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX

     if (use_f16_soft_max) {
         soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
@@ -8737,6 +8855,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
     peer_access_enabled = enable_peer_access;
 }

+// FIXME: move this somewhere else
+struct ggml_backend_cuda_split_buffer_type_context {
+    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+};
+
 static void ggml_cuda_op_mul_mat(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
     const bool convert_src1_to_q8_1) {
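The tensor_split array stored in this new context is consumed by get_row_split (added a few hunks above), which turns the cumulative fractions into per-device row ranges and rounds each boundary down to the matrix tile size returned by get_row_rounding. A host-only walk-through with made-up numbers:

```cpp
// Worked example of the get_row_split logic: cumulative fractions -> row ranges,
// with each boundary rounded down to a multiple of `rounding`.
#include <cstdint>
#include <cstdio>

int main() {
    const int     device_count   = 3;
    const int64_t nrows          = 1000;              // hypothetical tensor height
    const int64_t rounding       = 32;                // stand-in for get_row_rounding()
    const float   tensor_split[] = {0.0f, 0.25f, 0.5f};

    for (int id = 0; id < device_count; ++id) {
        int64_t row_low = id == 0 ? 0 : (int64_t)(nrows*tensor_split[id]);
        row_low -= row_low % rounding;

        int64_t row_high;
        if (id == device_count - 1) {
            row_high = nrows;                          // last device takes the remainder
        } else {
            row_high  = (int64_t)(nrows*tensor_split[id + 1]);
            row_high -= row_high % rounding;
        }
        printf("device %d: rows [%lld, %lld)\n", id, (long long)row_low, (long long)row_high);
    }
    return 0;
}
// -> device 0: [0, 224), device 1: [224, 480), device 2: [480, 1000)
```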
@@ -8788,6 +8911,14 @@ static void ggml_cuda_op_mul_mat(
|
|
8788
8911
|
GGML_ASSERT(!(split && ne03 > 1));
|
8789
8912
|
GGML_ASSERT(!(split && ne02 < ne12));
|
8790
8913
|
|
8914
|
+
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
8915
|
+
if (split) {
|
8916
|
+
// TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
|
8917
|
+
// GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
|
8918
|
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
8919
|
+
tensor_split = buft_ctx->tensor_split;
|
8920
|
+
}
|
8921
|
+
|
8791
8922
|
struct dev_data {
|
8792
8923
|
cuda_pool_alloc<char> src0_dd_alloc;
|
8793
8924
|
cuda_pool_alloc<float> src1_ddf_alloc;
|
@@ -8815,17 +8946,17 @@ static void ggml_cuda_op_mul_mat(
|
|
8815
8946
|
// for multi GPU, get the row boundaries from tensor split
|
8816
8947
|
// and round to mul_mat_q tile sizes
|
8817
8948
|
if (split) {
|
8818
|
-
const int64_t rounding = get_row_rounding(src0->type);
|
8949
|
+
const int64_t rounding = get_row_rounding(src0->type, tensor_split);
|
8819
8950
|
|
8820
8951
|
if (id != 0) {
|
8821
|
-
dev[id].row_low = ne01*
|
8952
|
+
dev[id].row_low = ne01*tensor_split[id];
|
8822
8953
|
if (dev[id].row_low < ne01) {
|
8823
8954
|
dev[id].row_low -= dev[id].row_low % rounding;
|
8824
8955
|
}
|
8825
8956
|
}
|
8826
8957
|
|
8827
8958
|
if (id != g_device_count - 1) {
|
8828
|
-
dev[id].row_high = ne01*
|
8959
|
+
dev[id].row_high = ne01*tensor_split[id + 1];
|
8829
8960
|
if (dev[id].row_high < ne01) {
|
8830
8961
|
dev[id].row_high -= dev[id].row_high % rounding;
|
8831
8962
|
}
|
@@ -9111,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
|
|
9111
9242
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
9112
9243
|
}
|
9113
9244
|
|
9114
|
-
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
9245
|
+
GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
9115
9246
|
if (!g_cublas_loaded) return false;
|
9116
9247
|
|
9117
9248
|
const int64_t ne10 = src1->ne[0];
|
@@ -9371,10 +9502,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
9371
9502
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
9372
9503
|
|
9373
9504
|
int64_t min_compute_capability = INT_MAX;
|
9374
|
-
|
9375
|
-
|
9376
|
-
|
9505
|
+
|
9506
|
+
if (split) {
|
9507
|
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
9508
|
+
auto & tensor_split = buft_ctx->tensor_split;
|
9509
|
+
for (int id = 0; id < g_device_count; ++id) {
|
9510
|
+
if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
|
9511
|
+
min_compute_capability = g_device_caps[id].cc;
|
9512
|
+
}
|
9377
9513
|
}
|
9514
|
+
} else {
|
9515
|
+
min_compute_capability = g_device_caps[g_main_device].cc;
|
9378
9516
|
}
|
9379
9517
|
|
9380
9518
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
@@ -9413,7 +9551,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
9413
9551
|
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
9414
9552
|
// KQV single-batch
|
9415
9553
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
9416
|
-
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
9554
|
+
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
9417
9555
|
// KQ + KQV multi-batch
|
9418
9556
|
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
9419
9557
|
} else if (src0->type == GGML_TYPE_F32) {
|
@@ -9875,297 +10013,39 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
|
|
9875
10013
|
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
9876
10014
|
}
|
9877
10015
|
|
9878
|
-
void
|
9879
|
-
|
9880
|
-
|
9881
|
-
|
9882
|
-
|
9883
|
-
const size_t nb1 = tensor->nb[1];
|
9884
|
-
|
9885
|
-
ggml_backend_type backend = tensor->backend;
|
9886
|
-
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
9887
|
-
memset(extra, 0, sizeof(*extra));
|
9888
|
-
|
9889
|
-
for (int id = 0; id < g_device_count; ++id) {
|
9890
|
-
if (backend == GGML_BACKEND_GPU && id != g_main_device) {
|
9891
|
-
continue;
|
9892
|
-
}
|
9893
|
-
|
9894
|
-
ggml_cuda_set_device(id);
|
9895
|
-
|
9896
|
-
int64_t row_low, row_high;
|
9897
|
-
if (backend == GGML_BACKEND_GPU) {
|
9898
|
-
row_low = 0;
|
9899
|
-
row_high = nrows;
|
9900
|
-
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
9901
|
-
const int64_t rounding = get_row_rounding(tensor->type);
|
9902
|
-
|
9903
|
-
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
9904
|
-
row_low -= row_low % rounding;
|
9905
|
-
|
9906
|
-
if (id == g_device_count - 1) {
|
9907
|
-
row_high = nrows;
|
9908
|
-
} else {
|
9909
|
-
row_high = nrows*g_tensor_split[id + 1];
|
9910
|
-
row_high -= row_high % rounding;
|
9911
|
-
}
|
9912
|
-
} else {
|
9913
|
-
GGML_ASSERT(false);
|
9914
|
-
}
|
9915
|
-
if (row_low == row_high) {
|
9916
|
-
continue;
|
9917
|
-
}
|
9918
|
-
|
9919
|
-
int64_t nrows_split = row_high - row_low;
|
9920
|
-
|
9921
|
-
const size_t offset_split = row_low*nb1;
|
9922
|
-
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
9923
|
-
const size_t original_size = size;
|
9924
|
-
|
9925
|
-
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
9926
|
-
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
9927
|
-
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
9928
|
-
}
|
9929
|
-
|
9930
|
-
char * buf;
|
9931
|
-
CUDA_CHECK(cudaMalloc(&buf, size));
|
9932
|
-
char * buf_host = (char *)data + offset_split;
|
9933
|
-
|
9934
|
-
// set padding to 0 to avoid possible NaN values
|
9935
|
-
if (size > original_size) {
|
9936
|
-
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
9937
|
-
}
|
9938
|
-
|
9939
|
-
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
|
9940
|
-
|
9941
|
-
extra->data_device[id] = buf;
|
9942
|
-
|
9943
|
-
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
9944
|
-
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
9945
|
-
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
9946
|
-
}
|
9947
|
-
}
|
9948
|
-
}
|
9949
|
-
|
9950
|
-
tensor->extra = extra;
|
9951
|
-
}
|
9952
|
-
|
9953
|
-
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
9954
|
-
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
10016
|
+
GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
|
10017
|
+
if (main_device >= g_device_count) {
|
10018
|
+
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
10019
|
+
main_device, g_device_count, g_main_device);
|
9955
10020
|
return;
|
9956
10021
|
}
|
9957
10022
|
|
9958
|
-
|
9959
|
-
|
9960
|
-
|
9961
|
-
|
9962
|
-
|
9963
|
-
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
9964
|
-
}
|
9965
|
-
|
9966
|
-
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
9967
|
-
if (extra->events[id][is] != nullptr) {
|
9968
|
-
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
9969
|
-
}
|
9970
|
-
}
|
9971
|
-
}
|
9972
|
-
|
9973
|
-
delete extra;
|
9974
|
-
}
|
9975
|
-
|
9976
|
-
static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
9977
|
-
static size_t g_temp_tensor_extra_index = 0;
|
9978
|
-
|
9979
|
-
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
9980
|
-
if (g_temp_tensor_extras == nullptr) {
|
9981
|
-
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
10023
|
+
if (g_main_device != main_device && g_device_count > 1) {
|
10024
|
+
g_main_device = main_device;
|
10025
|
+
//cudaDeviceProp prop;
|
10026
|
+
//CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
10027
|
+
//fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
9982
10028
|
}
|
9983
|
-
|
9984
|
-
size_t alloc_index = g_temp_tensor_extra_index;
|
9985
|
-
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
|
9986
|
-
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
9987
|
-
memset(extra, 0, sizeof(*extra));
|
9988
|
-
|
9989
|
-
return extra;
|
9990
10029
|
}
|
9991
10030
|
|
9992
|
-
|
9993
|
-
if (
|
9994
|
-
return;
|
9995
|
-
}
|
9996
|
-
|
9997
|
-
tensor->backend = GGML_BACKEND_GPU;
|
9998
|
-
|
9999
|
-
// recursively assign CUDA buffers until a compute tensor is found
|
10000
|
-
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
10001
|
-
const ggml_op src0_op = tensor->src[0]->op;
|
10002
|
-
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
10003
|
-
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
10004
|
-
}
|
10005
|
-
}
|
10006
|
-
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
|
10007
|
-
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
10008
|
-
}
|
10009
|
-
|
10010
|
-
if (scratch && no_alloc) {
|
10011
|
-
return;
|
10012
|
-
}
|
10013
|
-
|
10014
|
-
ggml_tensor_extra_gpu * extra;
|
10015
|
-
|
10016
|
-
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
10017
|
-
tensor->op == GGML_OP_VIEW ||
|
10018
|
-
force_inplace;
|
10019
|
-
const size_t size = ggml_nbytes(tensor);
|
10020
|
-
|
10021
|
-
ggml_cuda_set_device(g_main_device);
|
10022
|
-
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
10023
|
-
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
10024
|
-
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
10025
|
-
size_t offset = 0;
|
10026
|
-
if (tensor->op == GGML_OP_VIEW) {
|
10027
|
-
memcpy(&offset, tensor->op_params, sizeof(size_t));
|
10028
|
-
}
|
10029
|
-
extra = ggml_cuda_alloc_temp_tensor_extra();
|
10030
|
-
extra->data_device[g_main_device] = src0_ddc + offset;
|
10031
|
-
} else if (tensor->op == GGML_OP_CPY) {
|
10032
|
-
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
10033
|
-
void * src1_ddv = src1_extra->data_device[g_main_device];
|
10034
|
-
extra = ggml_cuda_alloc_temp_tensor_extra();
|
10035
|
-
extra->data_device[g_main_device] = src1_ddv;
|
10036
|
-
} else if (scratch) {
|
10037
|
-
GGML_ASSERT(size <= g_scratch_size);
|
10038
|
-
if (g_scratch_offset + size > g_scratch_size) {
|
10039
|
-
g_scratch_offset = 0;
|
10040
|
-
}
|
10041
|
-
|
10042
|
-
char * data = (char *) g_scratch_buffer;
|
10043
|
-
if (data == nullptr) {
|
10044
|
-
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
10045
|
-
g_scratch_buffer = data;
|
10046
|
-
}
|
10047
|
-
extra = ggml_cuda_alloc_temp_tensor_extra();
|
10048
|
-
extra->data_device[g_main_device] = data + g_scratch_offset;
|
10049
|
-
|
10050
|
-
g_scratch_offset += size;
|
10051
|
-
|
10052
|
-
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
10053
|
-
} else { // allocate new buffers outside of scratch
|
10054
|
-
void * data;
|
10055
|
-
CUDA_CHECK(cudaMalloc(&data, size));
|
10056
|
-
CUDA_CHECK(cudaMemset(data, 0, size));
|
10057
|
-
extra = new ggml_tensor_extra_gpu;
|
10058
|
-
memset(extra, 0, sizeof(*extra));
|
10059
|
-
extra->data_device[g_main_device] = data;
|
10060
|
-
}
|
10031
|
+
GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
10032
|
+
if (!g_cublas_loaded) return false;
|
10061
10033
|
|
10062
|
-
|
10063
|
-
|
10034
|
+
ggml_cuda_func_t func;
|
10035
|
+
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
10036
|
+
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
10037
|
+
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
10064
10038
|
|
10065
|
-
|
10066
|
-
|
10067
|
-
return;
|
10068
|
-
}
|
10069
|
-
if (g_scratch_buffer == nullptr) {
|
10070
|
-
ggml_cuda_set_device(g_main_device);
|
10071
|
-
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
10039
|
+
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
10040
|
+
return false;
|
10072
10041
|
}
|
10073
10042
|
|
10074
|
-
|
10075
|
-
|
10076
|
-
|
10077
|
-
|
10078
|
-
|
10079
|
-
|
10080
|
-
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
10081
|
-
size_t view_offset = 0;
|
10082
|
-
if (tensor->op == GGML_OP_VIEW) {
|
10083
|
-
memcpy(&view_offset, tensor->op_params, sizeof(size_t));
|
10084
|
-
}
|
10085
|
-
extra->data_device[g_main_device] = src0_ddc + view_offset;
|
10086
|
-
} else {
|
10087
|
-
extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
|
10088
|
-
}
|
10089
|
-
|
10090
|
-
tensor->extra = extra;
|
10091
|
-
}
|
10092
|
-
|
10093
|
-
void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
10094
|
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
10095
|
-
GGML_ASSERT(ggml_is_contiguous(tensor));
|
10096
|
-
|
10097
|
-
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
10098
|
-
ggml_cuda_set_device(g_main_device);
|
10099
|
-
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
10100
|
-
}
|
10101
|
-
|
10102
|
-
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
|
10103
|
-
ggml_cuda_assign_buffers_impl(tensor, true, false, false);
|
10104
|
-
}
|
10105
|
-
|
10106
|
-
void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
|
10107
|
-
ggml_cuda_assign_buffers_impl(tensor, true, false, true);
|
10108
|
-
}
|
10109
|
-
|
10110
|
-
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
|
10111
|
-
ggml_cuda_assign_buffers_impl(tensor, false, false, false);
|
10112
|
-
}
|
10113
|
-
|
10114
|
-
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
|
10115
|
-
ggml_cuda_assign_buffers_impl(tensor, false, true, false);
|
10116
|
-
}
|
10117
|
-
|
10118
|
-
void ggml_cuda_set_main_device(const int main_device) {
|
10119
|
-
if (main_device >= g_device_count) {
|
10120
|
-
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
10121
|
-
main_device, g_device_count, g_main_device);
|
10122
|
-
return;
|
10123
|
-
}
|
10124
|
-
|
10125
|
-
if (g_main_device != main_device && g_device_count > 1) {
|
10126
|
-
g_main_device = main_device;
|
10127
|
-
cudaDeviceProp prop;
|
10128
|
-
CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
10129
|
-
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
10130
|
-
}
|
10131
|
-
}
|
10132
|
-
|
10133
|
-
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
|
10134
|
-
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
|
10135
|
-
// it still won't always work as expected, but it's better than nothing
|
10136
|
-
if (scratch_size > g_scratch_size) {
|
10137
|
-
ggml_cuda_free_scratch();
|
10138
|
-
}
|
10139
|
-
g_scratch_size = std::max(g_scratch_size, scratch_size);
|
10140
|
-
}
|
10141
|
-
|
10142
|
-
void ggml_cuda_free_scratch() {
|
10143
|
-
if (g_scratch_buffer == nullptr) {
|
10144
|
-
return;
|
10145
|
-
}
|
10146
|
-
|
10147
|
-
CUDA_CHECK(cudaFree(g_scratch_buffer));
|
10148
|
-
g_scratch_buffer = nullptr;
|
10149
|
-
}
|
10150
|
-
|
10151
|
-
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
10152
|
-
if (!g_cublas_loaded) return false;
|
10153
|
-
|
10154
|
-
ggml_cuda_func_t func;
|
10155
|
-
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
10156
|
-
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
10157
|
-
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
10158
|
-
|
10159
|
-
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
10160
|
-
return false;
|
10161
|
-
}
|
10162
|
-
|
10163
|
-
if (tensor->op == GGML_OP_MUL_MAT) {
|
10164
|
-
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
|
10165
|
-
#ifndef NDEBUG
|
10166
|
-
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
|
10167
|
-
#endif
|
10168
|
-
return false;
|
10043
|
+
if (tensor->op == GGML_OP_MUL_MAT) {
|
10044
|
+
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
|
10045
|
+
#ifndef NDEBUG
|
10046
|
+
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
|
10047
|
+
#endif
|
10048
|
+
return false;
|
10169
10049
|
}
|
10170
10050
|
}
|
10171
10051
|
|
@@ -10306,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     return true;
 }

-int ggml_cuda_get_device_count() {
+GGML_CALL int ggml_cuda_get_device_count() {
     int device_count;
     if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
         return 0;
@@ -10314,7 +10194,7 @@ int ggml_cuda_get_device_count() {
     return device_count;
 }

-void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
@@ -10326,21 +10206,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
|
10326
10206
|
|
10327
10207
|
#define UNUSED GGML_UNUSED
|
10328
10208
|
|
10209
|
+
struct ggml_backend_cuda_context {
|
10210
|
+
int device;
|
10211
|
+
std::string name;
|
10212
|
+
};
|
10213
|
+
|
10329
10214
|
// cuda buffer
|
10330
10215
|
|
10331
|
-
struct
|
10216
|
+
struct ggml_backend_cuda_buffer_context {
|
10332
10217
|
int device;
|
10333
10218
|
void * dev_ptr = nullptr;
|
10334
10219
|
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
10335
10220
|
size_t temp_tensor_extra_index = 0;
|
10221
|
+
std::string name;
|
10336
10222
|
|
10337
|
-
|
10223
|
+
ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
|
10224
|
+
device(device), dev_ptr(dev_ptr),
|
10225
|
+
name(GGML_CUDA_NAME + std::to_string(device)) {
|
10226
|
+
}
|
10338
10227
|
|
10339
|
-
~
|
10228
|
+
~ggml_backend_cuda_buffer_context() {
|
10340
10229
|
delete[] temp_tensor_extras;
|
10341
10230
|
}
|
10342
10231
|
|
10343
10232
|
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
10233
|
+
// TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
|
10344
10234
|
if (temp_tensor_extras == nullptr) {
|
10345
10235
|
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
10346
10236
|
}
|
@@ -10354,19 +10244,28 @@ struct ggml_backend_buffer_context_cuda {
|
|
10354
10244
|
}
|
10355
10245
|
};
|
10356
10246
|
|
10357
|
-
static
|
10358
|
-
|
10247
|
+
GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
10248
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
10249
|
+
return ctx->name.c_str();
|
10250
|
+
}
|
10251
|
+
|
10252
|
+
GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
10253
|
+
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
10254
|
+
}
|
10255
|
+
|
10256
|
+
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
10257
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
10359
10258
|
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
10360
10259
|
delete ctx;
|
10361
10260
|
}
|
10362
10261
|
|
10363
|
-
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
10364
|
-
|
10262
|
+
GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
10263
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
10365
10264
|
return ctx->dev_ptr;
|
10366
10265
|
}
|
10367
10266
|
|
10368
|
-
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
10369
|
-
|
10267
|
+
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
10268
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
10370
10269
|
|
10371
10270
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
10372
10271
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
@@ -10395,14 +10294,12 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|
10395
10294
|
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
|
10396
10295
|
}
|
10397
10296
|
}
|
10398
|
-
|
10399
|
-
UNUSED(buffer);
|
10400
10297
|
}
|
10401
10298
|
|
10402
|
-
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
10299
|
+
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
10403
10300
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
10404
10301
|
|
10405
|
-
|
10302
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
10406
10303
|
|
10407
10304
|
ggml_cuda_set_device(ctx->device);
|
10408
10305
|
CUDA_CHECK(cudaDeviceSynchronize());
|
@@ -10410,61 +10307,93 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
|
10410
10307
|
CUDA_CHECK(cudaDeviceSynchronize());
|
10411
10308
|
}
|
10412
10309
|
|
10413
|
-
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
10310
|
+
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
10414
10311
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
10415
10312
|
|
10416
|
-
|
10313
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
10417
10314
|
|
10418
10315
|
ggml_cuda_set_device(ctx->device);
|
10419
10316
|
CUDA_CHECK(cudaDeviceSynchronize());
|
10420
|
-
|
10421
10317
|
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
|
10318
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
10422
10319
|
}
|
10423
10320
|
|
10424
|
-
static
|
10425
|
-
|
10321
|
+
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
10322
|
+
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
10323
|
+
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
10324
|
+
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
10325
|
+
|
10326
|
+
ggml_cuda_set_device(src_ctx->device);
|
10327
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
10328
|
+
ggml_cuda_set_device(dst_ctx->device);
|
10329
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
10330
|
+
CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
|
10331
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
10332
|
+
|
10333
|
+
return true;
|
10334
|
+
}
|
10335
|
+
return false;
|
10336
|
+
}
|
10337
|
+
|
10338
|
+
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
10339
|
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
10426
10340
|
|
10427
10341
|
ggml_cuda_set_device(ctx->device);
|
10428
10342
|
CUDA_CHECK(cudaDeviceSynchronize());
|
10429
|
-
|
10430
10343
|
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
|
10344
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
10431
10345
|
}
|
10432
10346
|
|
10433
|
-
static
|
10347
|
+
static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
10348
|
+
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
10434
10349
|
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
10435
10350
|
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
10436
10351
|
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
10437
10352
|
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
|
10438
10353
|
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
10439
|
-
/* .
|
10440
|
-
/* .cpy_tensor_to = */ NULL,
|
10354
|
+
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
|
10441
10355
|
/* .clear = */ ggml_backend_cuda_buffer_clear,
|
10356
|
+
/* .reset = */ NULL,
|
10442
10357
|
};
|
10443
10358
|
|
10444
10359
|
// cuda buffer type
|
10360
|
+
struct ggml_backend_cuda_buffer_type_context {
|
10361
|
+
int device;
|
10362
|
+
std::string name;
|
10363
|
+
};
|
10445
10364
|
|
10446
|
-
static
|
10447
|
-
|
10365
|
+
GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
10366
|
+
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
10448
10367
|
|
10449
|
-
|
10368
|
+
return ctx->name.c_str();
|
10369
|
+
}
|
10370
|
+
|
10371
|
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
10372
|
+
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
10373
|
+
|
10374
|
+
ggml_cuda_set_device(buft_ctx->device);
|
10450
10375
|
|
10451
10376
|
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
10452
10377
|
|
10453
10378
|
void * dev_ptr;
|
10454
|
-
|
10379
|
+
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
10380
|
+
if (err != cudaSuccess) {
|
10381
|
+
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
|
10382
|
+
return nullptr;
|
10383
|
+
}
|
10455
10384
|
|
10456
|
-
|
10385
|
+
ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
|
10457
10386
|
|
10458
|
-
return ggml_backend_buffer_init(buft,
|
10387
|
+
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
10459
10388
|
}
|
10460
10389
|
|
10461
|
-
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
10390
|
+
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
10462
10391
|
return 128;
|
10463
10392
|
|
10464
10393
|
UNUSED(buft);
|
10465
10394
|
}
|
10466
10395
|
|
10467
|
-
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
|
10396
|
+
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
10468
10397
|
int64_t row_low = 0;
|
10469
10398
|
int64_t row_high = ggml_nrows(tensor);
|
10470
10399
|
int64_t nrows_split = row_high - row_low;
|
@@ -10484,22 +10413,33 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
|
10484
10413
|
UNUSED(buft);
|
10485
10414
|
}
|
10486
10415
|
|
10487
|
-
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
10488
|
-
|
10416
|
+
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
10417
|
+
if (!ggml_backend_is_cuda(backend)) {
|
10418
|
+
return false;
|
10419
|
+
}
|
10489
10420
|
|
10490
|
-
|
10421
|
+
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
10422
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
10423
|
+
|
10424
|
+
return buft_ctx->device == cuda_ctx->device;
|
10491
10425
|
}
|
10492
10426
|
|
10493
10427
|
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
10428
|
+
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
10494
10429
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
10495
10430
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
10496
10431
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
10497
10432
|
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
10498
|
-
/* .is_host = */
|
10433
|
+
/* .is_host = */ NULL,
|
10499
10434
|
};
|
10500
10435
|
|
10501
|
-
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
10502
|
-
|
10436
|
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
10437
|
+
// FIXME: this is not thread safe
|
10438
|
+
if (device >= ggml_backend_cuda_get_device_count()) {
|
10439
|
+
return nullptr;
|
10440
|
+
}
|
10441
|
+
|
10442
|
+
static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
10503
10443
|
|
10504
10444
|
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
10505
10445
|
|
@@ -10507,7 +10447,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
10507
10447
|
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
10508
10448
|
ggml_backend_cuda_buffer_types[i] = {
|
10509
10449
|
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
10510
|
-
/* .context = */
|
10450
|
+
/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
|
10511
10451
|
};
|
10512
10452
|
}
|
10513
10453
|
ggml_backend_cuda_buffer_type_initialized = true;
|
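For callers of the gem's vendored llama.cpp, the per-device buffer type constructed above is reached through the public ggml-backend API rather than through these internal structs. A hedged usage sketch (it assumes the program links against this CUDA-enabled ggml build; error handling kept minimal):

```cpp
// Allocating a plain per-device CUDA buffer via the buffer type returned above.
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-cuda.h"

int main() {
    ggml_backend_buffer_type_t buft = ggml_backend_cuda_buffer_type(0 /* device */);
    if (buft == nullptr) {
        fprintf(stderr, "no CUDA device 0\n");
        return 1;
    }

    // 16 MiB device allocation owned by the backend buffer object
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16u*1024*1024);
    if (buf == nullptr) {
        fprintf(stderr, "allocation failed\n");
        return 1;
    }

    printf("allocated %zu bytes\n", ggml_backend_buffer_get_size(buf));
    ggml_backend_buffer_free(buf);
    return 0;
}
```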
@@ -10516,13 +10456,311 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
10516
10456
|
return &ggml_backend_cuda_buffer_types[device];
|
10517
10457
|
}
|
10518
10458
|
|
10459
|
+
// cuda split buffer
|
10460
|
+
|
10461
|
+
struct ggml_backend_cuda_split_buffer_context {
|
10462
|
+
~ggml_backend_cuda_split_buffer_context() {
|
10463
|
+
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
|
10464
|
+
for (int id = 0; id < g_device_count; ++id) {
|
10465
|
+
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
10466
|
+
if (extra->events[id][is] != nullptr) {
|
10467
|
+
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
10468
|
+
}
|
10469
|
+
}
|
10470
|
+
if (extra->data_device[id] != nullptr) {
|
10471
|
+
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
10472
|
+
}
|
10473
|
+
}
|
10474
|
+
delete extra;
|
10475
|
+
}
|
10476
|
+
}
|
10477
|
+
|
10478
|
+
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
10479
|
+
};
|
10480
|
+
|
10481
|
+
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
10482
|
+
+    return GGML_CUDA_NAME "_Split";
+
+    UNUSED(buffer);
+}
+
+// unused at the moment
+//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
+//    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
+//}
+
+GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
+    delete ctx;
+}
+
+GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
+    return (void *)0x1000;
+
+    UNUSED(buffer);
+}
+
+GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+
+    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
+    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
+
+    ctx->tensor_extras.push_back(extra);
+
+    for (int id = 0; id < g_device_count; ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        // FIXME: do not crash if cudaMalloc fails
+        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
+        ggml_cuda_set_device(id);
+        char * buf;
+        CUDA_CHECK(cudaMalloc(&buf, size));
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
+
+        extra->data_device[id] = buf;
+
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+        }
+    }
+    tensor->backend = GGML_BACKEND_GPU_SPLIT;
+    tensor->extra = extra;
+}
+
+GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int id = 0; id < g_device_count; ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        const char * buf_host = (const char *)data + offset_split;
+        CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
+    }
+}
+
+GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int id = 0; id < g_device_count; ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        char * buf_host = (char *)data + offset_split;
+        CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
+    }
+}
+
+GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    UNUSED(buffer);
+    UNUSED(value);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
+    /* .get_name    = */ ggml_backend_cuda_split_buffer_get_name,
+    /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
+    /* .get_base    = */ ggml_backend_cuda_split_buffer_get_base,
+    /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
+    /* .set_tensor  = */ ggml_backend_cuda_split_buffer_set_tensor,
+    /* .get_tensor  = */ ggml_backend_cuda_split_buffer_get_tensor,
+    /* .cpy_tensor  = */ NULL,
+    /* .clear       = */ ggml_backend_cuda_split_buffer_clear,
+    /* .reset       = */ NULL,
+};
+
+// cuda split buffer type
+
+GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return GGML_CUDA_NAME "_Split";
+
+    UNUSED(buft);
+}
+
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
+    // instead, we allocate them for each tensor separately in init_tensor
+    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+    ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+}
+
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    UNUSED(buft);
+}
+
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
+
+    size_t total_size = 0;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    for (int id = 0; id < g_device_count; ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        total_size += ggml_nbytes_split(tensor, nrows_split);
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return total_size;
+}
+
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
+    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
+    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
+};
+
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+    // FIXME: this is not thread safe
+    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
+
+    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
+
+    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
+    if (all_zero) {
+        tensor_split_arr = g_default_tensor_split;
+    } else {
+        float split_sum = 0.0f;
+        for (int i = 0; i < g_device_count; ++i) {
+            tensor_split_arr[i] = split_sum;
+            split_sum += tensor_split[i];
+        }
+        for (int i = 0; i < g_device_count; ++i) {
+            tensor_split_arr[i] /= split_sum;
+        }
+    }
+
+    auto it = buft_map.find(tensor_split_arr);
+    if (it != buft_map.end()) {
+        return &it->second;
+    }
+
+    struct ggml_backend_buffer_type buft {
+        /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
+        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
+    };
+
+    auto result = buft_map.emplace(tensor_split_arr, buft);
+    return &result.first->second;
+}
+
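The tensor_split handling added above turns per-device proportions into normalized cumulative row boundaries via a prefix sum. A minimal standalone sketch of that normalization, using made-up proportions and a fixed device count (not part of the vendored diff), for illustration:

#include <array>
#include <cstdio>

int main() {
    const int   device_count    = 3;
    const float tensor_split[3] = {1.0f, 2.0f, 1.0f}; // hypothetical per-device weights

    std::array<float, 3> boundaries = {};
    float split_sum = 0.0f;
    for (int i = 0; i < device_count; ++i) {
        boundaries[i]  = split_sum;      // cumulative start of device i
        split_sum     += tensor_split[i];
    }
    for (int i = 0; i < device_count; ++i) {
        boundaries[i] /= split_sum;      // normalize to [0, 1)
    }

    // prints "0.00 0.25 0.75": device 0 owns rows [0, 0.25), device 1 [0.25, 0.75), device 2 [0.75, 1.0)
    for (int i = 0; i < device_count; ++i) {
        printf("%.2f ", boundaries[i]);
    }
    printf("\n");
    return 0;
}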
 // host buffer type
 
-static
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return GGML_CUDA_NAME "_Host";
+
+    UNUSED(buft);
+}
+
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return GGML_CUDA_NAME "_Host";
+
+    UNUSED(buffer);
+}
+
+GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_cuda_host_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     void * ptr = ggml_cuda_host_malloc(size);
 
     if (ptr == nullptr) {
@@ -10530,17 +10768,18 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
         return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
     }
 
-    // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
 }
 
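The alloc_buffer implementation above deliberately reuses the CPU buffer machinery: it wraps pinned memory from ggml_cuda_host_malloc in a CPU buffer and only overrides the name and free callbacks. A rough usage sketch, assuming the generic ggml-backend buffer API; the buffer size here is arbitrary:

// Sketch only: allocate a pinned host staging buffer through the CUDA host buffer type.
ggml_backend_buffer_type_t host_buft = ggml_backend_cuda_host_buffer_type();
ggml_backend_buffer_t      staging   = ggml_backend_buft_alloc_buffer(host_buft, 16*1024*1024);
void * base = ggml_backend_buffer_get_base(staging); // pinned memory; falls back to plain CPU memory if cudaMallocHost failed
// ... stage tensor data in `base` for faster host<->device copies ...
ggml_backend_buffer_free(staging);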
-ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
         /* .iface = */ {
+            /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
             /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
             /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
@@ -10555,31 +10794,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
 
 // backend
 
-
-
-};
+GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
-
-    return GGML_CUDA_NAME;
-
-    UNUSED(backend);
+    return cuda_ctx->name.c_str();
 }
 
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-
+GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     delete cuda_ctx;
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     return ggml_backend_cuda_buffer_type(cuda_ctx->device);
 }
 
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-
+GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10587,8 +10822,8 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-
+GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10596,39 +10831,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static
-
-
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
-
-    UNUSED(backend);
-}
-
-static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    GGML_ASSERT(!"not implemented");
+GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
-
+    if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
+        return true;
+    }
 
-
-    UNUSED(cgraph);
+    return false;
 }
 
-static void
-
+GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
-
-    UNUSED(plan);
-}
-
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
 
     UNUSED(backend);
-    UNUSED(plan);
 }
 
-static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-
+GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);
 
@@ -10638,57 +10861,35 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
+        }
 
-
+#ifndef NDEBUG
+        assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
         assert(node->extra != nullptr);
 
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
                 assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
                 assert(node->src[j]->extra != nullptr);
             }
         }
+#endif
 
         bool ok = ggml_cuda_compute_forward(&params, node);
         if (!ok) {
            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
        }
        GGML_ASSERT(ok);
-
-#if 0
-        if (node->type == GGML_TYPE_F32) {
-            cudaDeviceSynchronize();
-            std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
-            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                ggml_type_name(node->src[0]->type),
-                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                node->src[0]->name,
-                node->src[1] ? node->src[1]->name : "none");
-            double sum = 0.0;
-            double sq_sum = 0.0;
-            for (int i = 0; i < ggml_nelements(node); i++) {
-                printf("%f ", tmp[i]);
-                sum += tmp[i];
-                sq_sum += tmp[i]*tmp[i];
-            }
-            printf("\n");
-            printf("sum: %f, ", sum);
-            printf("sq_sum: %f\n", sq_sum);
-        }
-#endif
     }
 
-    UNUSED(backend);
-
     return true;
 }
 
-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -10799,23 +11000,22 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
     UNUSED(backend);
 }
 
-static ggml_backend_i
+static ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name = */ ggml_backend_cuda_name,
     /* .free = */ ggml_backend_cuda_free,
     /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
     /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
-    /* .
-    /* .cpy_tensor_to_async = */ NULL,
+    /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
     /* .synchronize = */ ggml_backend_cuda_synchronize,
-    /* .graph_plan_create = */
-    /* .graph_plan_free = */
-    /* .graph_plan_compute = */
+    /* .graph_plan_create = */ NULL,
+    /* .graph_plan_free = */ NULL,
+    /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_cuda_graph_compute,
     /* .supports_op = */ ggml_backend_cuda_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init(int device) {
+GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
     if (device < 0 || device >= ggml_cuda_get_device_count()) {
@@ -10826,32 +11026,48 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
     // not strictly necessary, but it may reduce the overhead of the first graph_compute
     ggml_cuda_set_main_device(device);
 
-
-        /* .device = */ device
+    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
+        /* .device = */ device,
+        /* .name = */ GGML_CUDA_NAME + std::to_string(device),
     };
 
     ggml_backend_t cuda_backend = new ggml_backend {
-        /* .interface = */
+        /* .interface = */ ggml_backend_cuda_interface,
         /* .context = */ ctx
     };
 
     return cuda_backend;
 }
 
-bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend->iface.get_name == ggml_backend_cuda_name;
+GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend && backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+GGML_CALL int ggml_backend_cuda_get_device_count() {
+    return ggml_cuda_get_device_count();
+}
+
+GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
+    ggml_cuda_get_device_description(device, description, description_size);
+}
+
+GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
+    ggml_cuda_set_device(device);
+
+    CUDA_CHECK(cudaMemGetInfo(free, total));
 }
 
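The device-query wrappers above are thin passthroughs to the internal helpers. A hedged usage sketch (assumes at least one CUDA device and <cstdio> for printf):

// Sketch only: enumerate CUDA devices via the wrappers added above.
const int n_dev = ggml_backend_cuda_get_device_count();
for (int i = 0; i < n_dev; ++i) {
    char   desc[128];
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
    ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
    printf("device %d: %s, %zu / %zu bytes free\n", i, desc, free_mem, total_mem);
}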
-
+// backend registry
+GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
     ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
     return cuda_backend;
 
     UNUSED(params);
 }
 
-extern "C" int ggml_backend_cuda_reg_devices();
+extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
 
-int ggml_backend_cuda_reg_devices() {
+GGML_CALL int ggml_backend_cuda_reg_devices() {
     int device_count = ggml_cuda_get_device_count();
     //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
     for (int i = 0; i < device_count; i++) {