llama_cpp 0.12.1 → 0.12.2
This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
@@ -8,8 +8,13 @@
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
+#include <string>
 #include <vector>
-
+#include <map>
+#include <array>
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"

 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -77,6 +82,7 @@
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
 #define cudaMemsetAsync hipMemsetAsync
+#define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -112,9 +118,7 @@

 #endif // defined(GGML_USE_HIPBLAS)

-#
-#include "ggml.h"
-#include "ggml-backend-impl.h"
+#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

 #define CC_PASCAL 600
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
@@ -519,6 +523,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256

+#define CUDA_Q8_0_NE_ALIGN 2048
+
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
@@ -562,7 +568,7 @@ static void ggml_cuda_set_device(const int device) {

 static int g_device_count = -1;
 static int g_main_device = 0;
-static float
+static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};

 struct cuda_device_capabilities {
 int cc; // compute capability
@@ -573,10 +579,6 @@ struct cuda_device_capabilities {

 static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };

-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 0; // disabled by default
-static size_t g_scratch_offset = 0;
-
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

 [[noreturn]]
@@ -605,16 +607,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 }

 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if
-(void) a;
-bad_arch();
-#else
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
 for (int mask = 16; mask > 0; mask >>= 1) {
 a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
 }
 return a;
-#
+#else
+(void) a;
+bad_arch();
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }

 static __device__ __forceinline__ float warp_reduce_max(float x) {
@@ -626,16 +628,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 }

 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if
-(void) x;
-bad_arch();
-#else
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 #pragma unroll
 for (int mask = 16; mask > 0; mask >>= 1) {
 x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
 }
 return x;
-#
+#else
+(void) x;
+bad_arch();
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
@@ -1103,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

+template<typename dst_t>
+static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+const int i = blockIdx.x;
+
+// assume 32 threads
+const int tid = threadIdx.x;
+const int il = tid/8;
+const int ir = tid%8;
+const int ib = 8*i + ir;
+if (ib >= nb32) {
+return;
+}
+
+dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+const float d = __half2float(x->d);
+const float dm = -8*d;
+
+const uint8_t * q = x->qs + 4*il;
+
+for (int l = 0; l < 4; ++l) {
+y[l+ 0] = d * (q[l] & 0xF) + dm;
+y[l+16] = d * (q[l] >> 4) + dm;
+}
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+const int i = blockIdx.x;
+
+// assume 32 threads
+const int tid = threadIdx.x;
+const int il = tid/8;
+const int ir = tid%8;
+const int ib = 8*i + ir;
+if (ib >= nb32) {
+return;
+}
+
+dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+const float2 d = __half22float2(x->dm);
+
+const uint8_t * q = x->qs + 4*il;
+
+for (int l = 0; l < 4; ++l) {
+y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
+y[l+16] = d.x * (q[l] >> 4) + d.y;
+}
+}
+
 //================================== k-quants

 template<typename dst_t>
@@ -2327,6 +2384,45 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
 y[i] = x[i];
 }

+template <bool need_check>
+static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+#if __CUDA_ARCH__ >= CC_PASCAL
+constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
+
+const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
+const int * x0 = ((int *) vx) + blockIdx.x * nint;
+half2 * y2 = (half2 *) (y + i0);
+
+__shared__ int vals[nint];
+
+#pragma unroll
+for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
+if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
+break;
+}
+
+const int ix = ix0 + threadIdx.x;
+vals[ix] = x0[ix];
+}
+
+#pragma unroll
+for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
+if (need_check && i0 + iy + 2*threadIdx.x >= k) {
+return;
+}
+
+const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
+const half d = *b0;
+const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
+
+y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
+}
+#else
+(void) vx; (void) y; (void) k;
+bad_arch();
+#endif // __CUDA_ARCH__ >= CC_PASCAL
+}
+
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

@@ -5613,7 +5709,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

 template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
 static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
 const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;

@@ -5738,7 +5834,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
 #else
 (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
 bad_arch();
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

 template <bool vals_smem, int ncols_template, int block_size_template>
@@ -6181,6 +6277,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri
 dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
+if (k % CUDA_Q8_0_NE_ALIGN == 0) {
+const bool need_check = false;
+dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+} else {
+const bool need_check = true;
+dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+}
+}
+
 template<typename dst_t>
 static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
@@ -6201,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }

+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+const int nb32 = k / 32;
+const int nb = (k + 255) / 256;
+dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+const int nb32 = k / 32;
+const int nb = (k + 255) / 256;
+dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
 template<typename dst_t>
 static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
@@ -6246,16 +6367,21 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
 }

 static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+int id;
 switch (type) {
 case GGML_TYPE_Q4_0:
-return
+return dequantize_row_q4_0_cuda;
 case GGML_TYPE_Q4_1:
-return
+return dequantize_row_q4_1_cuda;
 case GGML_TYPE_Q5_0:
 return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
 case GGML_TYPE_Q5_1:
 return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
 case GGML_TYPE_Q8_0:
+CUDA_CHECK(cudaGetDevice(&id));
+if (g_device_caps[id].cc >= CC_PASCAL) {
+return dequantize_block_q8_0_f16_cuda;
+}
 return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
 case GGML_TYPE_Q2_K:
 return dequantize_row_q2_K_cuda;
@@ -6281,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
 switch (type) {
 case GGML_TYPE_Q4_0:
-return
+return dequantize_row_q4_0_cuda;
 case GGML_TYPE_Q4_1:
-return
+return dequantize_row_q4_1_cuda;
 case GGML_TYPE_Q5_0:
 return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
 case GGML_TYPE_Q5_1:
@@ -7489,11 +7615,11 @@ struct cuda_pool_alloc {

 static bool g_cublas_loaded = false;

-bool ggml_cublas_loaded(void) {
+GGML_CALL bool ggml_cublas_loaded(void) {
 return g_cublas_loaded;
 }

-void ggml_init_cublas() {
+GGML_CALL void ggml_init_cublas() {
 static bool initialized = false;

 if (!initialized) {
@@ -7546,8 +7672,9 @@ void ggml_init_cublas() {
 CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
 fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

-
+g_default_tensor_split[id] = total_vram;
 total_vram += prop.totalGlobalMem;
+
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
@@ -7556,7 +7683,7 @@ void ggml_init_cublas() {
 g_device_caps[id].smpb = prop.sharedMemPerBlock;
 }
 for (int id = 0; id < g_device_count; ++id) {
-
+g_default_tensor_split[id] /= total_vram;
 }

 for (int id = 0; id < g_device_count; ++id) {
@@ -7580,31 +7707,7 @@ void ggml_init_cublas() {
 }
 }

-void
-if (tensor_split == nullptr) {
-return;
-}
-bool all_zero = true;
-for (int i = 0; i < g_device_count; ++i) {
-if (tensor_split[i] != 0.0f) {
-all_zero = false;
-break;
-}
-}
-if (all_zero) {
-return;
-}
-float split_sum = 0.0f;
-for (int i = 0; i < g_device_count; ++i) {
-g_tensor_split[i] = split_sum;
-split_sum += tensor_split[i];
-}
-for (int i = 0; i < g_device_count; ++i) {
-g_tensor_split[i] /= split_sum;
-}
-}
-
-void * ggml_cuda_host_malloc(size_t size) {
+GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
 if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
 return nullptr;
 }
@@ -7622,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
 return ptr;
 }

-void ggml_cuda_host_free(void * ptr) {
+GGML_CALL void ggml_cuda_host_free(void * ptr) {
 CUDA_CHECK(cudaFreeHost(ptr));
 }

@@ -8055,11 +8158,11 @@ static void ggml_cuda_op_mul_mat_q(
 (void) src1_ddf_i;
 }

-static int64_t get_row_rounding(ggml_type type) {
+static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
 int64_t min_compute_capability = INT_MAX;
 int64_t max_compute_capability = INT_MIN;
 for (int id = 0; id < g_device_count; ++id) {
-if (
+if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
 if (min_compute_capability > g_device_caps[id].cc) {
 min_compute_capability = g_device_caps[id].cc;
 }
@@ -8120,6 +8223,21 @@ static int64_t get_row_rounding(ggml_type type) {
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }

+static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
+const int64_t nrows = ggml_nrows(tensor);
+const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+
+*row_low = id == 0 ? 0 : nrows*tensor_split[id];
+*row_low -= *row_low % rounding;
+
+if (id == g_device_count - 1) {
+*row_high = nrows;
+} else {
+*row_high = nrows*tensor_split[id + 1];
+*row_high -= *row_high % rounding;
+}
+}
+
 static void ggml_cuda_op_mul_mat_vec_q(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -8574,15 +8692,15 @@ static void ggml_cuda_op_soft_max(
 float scale = 1.0f;
 memcpy(&scale, dst->op_params, sizeof(float));

-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-const bool use_f16_soft_max = false;
-#else
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
 #ifdef GGML_CUDA_F16
 const bool use_f16_soft_max = true;
 #else
 const bool use_f16_soft_max = false;
 #endif // GGML_CUDA_F16
-#
+#else
+const bool use_f16_soft_max = false;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX

 if (use_f16_soft_max) {
 soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
@@ -8737,6 +8855,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
 peer_access_enabled = enable_peer_access;
 }

+// FIXME: move this somewhere else
+struct ggml_backend_cuda_split_buffer_type_context {
+std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+};
+
 static void ggml_cuda_op_mul_mat(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
 const bool convert_src1_to_q8_1) {
@@ -8788,6 +8911,14 @@ static void ggml_cuda_op_mul_mat(
 GGML_ASSERT(!(split && ne03 > 1));
 GGML_ASSERT(!(split && ne02 < ne12));

+std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+if (split) {
+// TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
+// GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
+ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
+tensor_split = buft_ctx->tensor_split;
+}
+
 struct dev_data {
 cuda_pool_alloc<char> src0_dd_alloc;
 cuda_pool_alloc<float> src1_ddf_alloc;
@@ -8815,17 +8946,17 @@ static void ggml_cuda_op_mul_mat(
 // for multi GPU, get the row boundaries from tensor split
 // and round to mul_mat_q tile sizes
 if (split) {
-const int64_t rounding = get_row_rounding(src0->type);
+const int64_t rounding = get_row_rounding(src0->type, tensor_split);

 if (id != 0) {
-dev[id].row_low = ne01*
+dev[id].row_low = ne01*tensor_split[id];
 if (dev[id].row_low < ne01) {
 dev[id].row_low -= dev[id].row_low % rounding;
 }
 }

 if (id != g_device_count - 1) {
-dev[id].row_high = ne01*
+dev[id].row_high = ne01*tensor_split[id + 1];
 if (dev[id].row_high < ne01) {
 dev[id].row_high -= dev[id].row_high % rounding;
 }
@@ -9111,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }

-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
 if (!g_cublas_loaded) return false;

 const int64_t ne10 = src1->ne[0];
@@ -9371,10 +9502,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;

 int64_t min_compute_capability = INT_MAX;
-
-
-
+
+if (split) {
+ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
+auto & tensor_split = buft_ctx->tensor_split;
+for (int id = 0; id < g_device_count; ++id) {
+if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+min_compute_capability = g_device_caps[id].cc;
+}
 }
+} else {
+min_compute_capability = g_device_caps[g_main_device].cc;
 }

 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@@ -9413,7 +9551,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
 // KQV single-batch
 ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
 // KQ + KQV multi-batch
 ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
 } else if (src0->type == GGML_TYPE_F32) {
@@ -9875,297 +10013,39 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
 return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }

-void
-
-
-
-
-const size_t nb1 = tensor->nb[1];
-
-ggml_backend_type backend = tensor->backend;
-ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
-memset(extra, 0, sizeof(*extra));
-
-for (int id = 0; id < g_device_count; ++id) {
-if (backend == GGML_BACKEND_GPU && id != g_main_device) {
-continue;
-}
-
-ggml_cuda_set_device(id);
-
-int64_t row_low, row_high;
-if (backend == GGML_BACKEND_GPU) {
-row_low = 0;
-row_high = nrows;
-} else if (backend == GGML_BACKEND_GPU_SPLIT) {
-const int64_t rounding = get_row_rounding(tensor->type);
-
-row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-row_low -= row_low % rounding;
-
-if (id == g_device_count - 1) {
-row_high = nrows;
-} else {
-row_high = nrows*g_tensor_split[id + 1];
-row_high -= row_high % rounding;
-}
-} else {
-GGML_ASSERT(false);
-}
-if (row_low == row_high) {
-continue;
-}
-
-int64_t nrows_split = row_high - row_low;
-
-const size_t offset_split = row_low*nb1;
-size_t size = ggml_nbytes_split(tensor, nrows_split);
-const size_t original_size = size;
-
-// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-if (ne0 % MATRIX_ROW_PADDING != 0) {
-size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-}
-
-char * buf;
-CUDA_CHECK(cudaMalloc(&buf, size));
-char * buf_host = (char *)data + offset_split;
-
-// set padding to 0 to avoid possible NaN values
-if (size > original_size) {
-CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
-}
-
-CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
-
-extra->data_device[id] = buf;
-
-if (backend == GGML_BACKEND_GPU_SPLIT) {
-for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
-}
-}
-}
-
-tensor->extra = extra;
-}
-
-void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
+if (main_device >= g_device_count) {
+fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
+main_device, g_device_count, g_main_device);
 return;
 }

-
-
-
-
-
-CUDA_CHECK(cudaFree(extra->data_device[id]));
-}
-
-for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-if (extra->events[id][is] != nullptr) {
-CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
-}
-}
-}
-
-delete extra;
-}
-
-static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
-static size_t g_temp_tensor_extra_index = 0;
-
-static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
-if (g_temp_tensor_extras == nullptr) {
-g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+if (g_main_device != main_device && g_device_count > 1) {
+g_main_device = main_device;
+//cudaDeviceProp prop;
+//CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
+//fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
 }
-
-size_t alloc_index = g_temp_tensor_extra_index;
-g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
-ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
-memset(extra, 0, sizeof(*extra));
-
-return extra;
 }

-
-if (
-return;
-}
-
-tensor->backend = GGML_BACKEND_GPU;
-
-// recursively assign CUDA buffers until a compute tensor is found
-if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
-const ggml_op src0_op = tensor->src[0]->op;
-if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
-}
-}
-if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
-}
-
-if (scratch && no_alloc) {
-return;
-}
-
-ggml_tensor_extra_gpu * extra;
-
-const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
-tensor->op == GGML_OP_VIEW ||
-force_inplace;
-const size_t size = ggml_nbytes(tensor);
-
-ggml_cuda_set_device(g_main_device);
-if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
-char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-size_t offset = 0;
-if (tensor->op == GGML_OP_VIEW) {
-memcpy(&offset, tensor->op_params, sizeof(size_t));
-}
-extra = ggml_cuda_alloc_temp_tensor_extra();
-extra->data_device[g_main_device] = src0_ddc + offset;
-} else if (tensor->op == GGML_OP_CPY) {
-ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
-void * src1_ddv = src1_extra->data_device[g_main_device];
-extra = ggml_cuda_alloc_temp_tensor_extra();
-extra->data_device[g_main_device] = src1_ddv;
-} else if (scratch) {
-GGML_ASSERT(size <= g_scratch_size);
-if (g_scratch_offset + size > g_scratch_size) {
-g_scratch_offset = 0;
-}
-
-char * data = (char *) g_scratch_buffer;
-if (data == nullptr) {
-CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
-g_scratch_buffer = data;
-}
-extra = ggml_cuda_alloc_temp_tensor_extra();
-extra->data_device[g_main_device] = data + g_scratch_offset;
-
-g_scratch_offset += size;
-
-GGML_ASSERT(g_scratch_offset <= g_scratch_size);
-} else { // allocate new buffers outside of scratch
-void * data;
-CUDA_CHECK(cudaMalloc(&data, size));
-CUDA_CHECK(cudaMemset(data, 0, size));
-extra = new ggml_tensor_extra_gpu;
-memset(extra, 0, sizeof(*extra));
-extra->data_device[g_main_device] = data;
-}
+GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+if (!g_cublas_loaded) return false;

-
-
+ggml_cuda_func_t func;
+const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

-
-
-return;
-}
-if (g_scratch_buffer == nullptr) {
-ggml_cuda_set_device(g_main_device);
-CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
+return false;
 }

-
-
-
-
-
-
-char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-size_t view_offset = 0;
-if (tensor->op == GGML_OP_VIEW) {
-memcpy(&view_offset, tensor->op_params, sizeof(size_t));
-}
-extra->data_device[g_main_device] = src0_ddc + view_offset;
-} else {
-extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
-}
-
-tensor->extra = extra;
-}
-
-void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
-GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-GGML_ASSERT(ggml_is_contiguous(tensor));
-
-ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-ggml_cuda_set_device(g_main_device);
-CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
-}
-
-void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-ggml_cuda_assign_buffers_impl(tensor, true, false, false);
-}
-
-void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
-ggml_cuda_assign_buffers_impl(tensor, true, false, true);
-}
-
-void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-ggml_cuda_assign_buffers_impl(tensor, false, false, false);
-}
-
-void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-ggml_cuda_assign_buffers_impl(tensor, false, true, false);
-}
-
-void ggml_cuda_set_main_device(const int main_device) {
-if (main_device >= g_device_count) {
-fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
-main_device, g_device_count, g_main_device);
-return;
-}
-
-if (g_main_device != main_device && g_device_count > 1) {
-g_main_device = main_device;
-cudaDeviceProp prop;
-CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
-fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
-}
-}
-
-void ggml_cuda_set_scratch_size(const size_t scratch_size) {
-// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
-// it still won't always work as expected, but it's better than nothing
-if (scratch_size > g_scratch_size) {
-ggml_cuda_free_scratch();
-}
-g_scratch_size = std::max(g_scratch_size, scratch_size);
-}
-
-void ggml_cuda_free_scratch() {
-if (g_scratch_buffer == nullptr) {
-return;
-}
-
-CUDA_CHECK(cudaFree(g_scratch_buffer));
-g_scratch_buffer = nullptr;
-}
-
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-if (!g_cublas_loaded) return false;
-
-ggml_cuda_func_t func;
-const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
-
-if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
-return false;
-}
-
-if (tensor->op == GGML_OP_MUL_MAT) {
-if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
-#ifndef NDEBUG
-fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
-#endif
-return false;
+if (tensor->op == GGML_OP_MUL_MAT) {
+if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+return false;
 }
 }

@@ -10306,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 return true;
 }

-int ggml_cuda_get_device_count() {
+GGML_CALL int ggml_cuda_get_device_count() {
 int device_count;
 if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
 return 0;
@@ -10314,7 +10194,7 @@ int ggml_cuda_get_device_count() {
 return device_count;
 }

-void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
 cudaDeviceProp prop;
 CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
 snprintf(description, description_size, "%s", prop.name);
@@ -10326,21 +10206,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des

 #define UNUSED GGML_UNUSED

+struct ggml_backend_cuda_context {
+int device;
+std::string name;
+};
+
 // cuda buffer

-struct
+struct ggml_backend_cuda_buffer_context {
 int device;
 void * dev_ptr = nullptr;
 ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
 size_t temp_tensor_extra_index = 0;
+std::string name;

-
+ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
+device(device), dev_ptr(dev_ptr),
+name(GGML_CUDA_NAME + std::to_string(device)) {
+}

-~
+~ggml_backend_cuda_buffer_context() {
 delete[] temp_tensor_extras;
 }

 ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+// TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
 if (temp_tensor_extras == nullptr) {
 temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
 }
@@ -10354,19 +10244,28 @@ struct ggml_backend_buffer_context_cuda {
 }
 };

-static
-
+GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
+ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+return ctx->name.c_str();
+}
+
+GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
+}
+
+GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 CUDA_CHECK(cudaFree(ctx->dev_ptr));
 delete ctx;
 }

-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
-
+GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 return ctx->dev_ptr;
 }

-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-
+GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

 if (tensor->view_src != NULL && tensor->view_offs == 0) {
 assert(tensor->view_src->buffer->buft == buffer->buft);
@@ -10395,14 +10294,12 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
 CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
 }
 }
-
-UNUSED(buffer);
 }

-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

-
+ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

 ggml_cuda_set_device(ctx->device);
 CUDA_CHECK(cudaDeviceSynchronize());
@@ -10410,61 +10307,93 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
 CUDA_CHECK(cudaDeviceSynchronize());
 }

-static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

-
+ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

 ggml_cuda_set_device(ctx->device);
 CUDA_CHECK(cudaDeviceSynchronize());
-
 CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+CUDA_CHECK(cudaDeviceSynchronize());
 }

-static
-
+GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+if (ggml_backend_buffer_is_cuda(src->buffer)) {
+ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
+ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+ggml_cuda_set_device(src_ctx->device);
+CUDA_CHECK(cudaDeviceSynchronize());
+ggml_cuda_set_device(dst_ctx->device);
+CUDA_CHECK(cudaDeviceSynchronize());
+CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
+CUDA_CHECK(cudaDeviceSynchronize());
+
+return true;
+}
+return false;
+}
+
+GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

 ggml_cuda_set_device(ctx->device);
 CUDA_CHECK(cudaDeviceSynchronize());
-
 CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
+CUDA_CHECK(cudaDeviceSynchronize());
 }

-static
+static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
+/* .get_name = */ ggml_backend_cuda_buffer_get_name,
 /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
 /* .get_base = */ ggml_backend_cuda_buffer_get_base,
 /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
 /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
 /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
-/* .
-/* .cpy_tensor_to = */ NULL,
+/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
 /* .clear = */ ggml_backend_cuda_buffer_clear,
+/* .reset = */ NULL,
 };

 // cuda buffer type
+struct ggml_backend_cuda_buffer_type_context {
+int device;
+std::string name;
+};

-static
-
+GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
+ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

-
+return ctx->name.c_str();
+}
+
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+ggml_cuda_set_device(buft_ctx->device);

 size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

 void * dev_ptr;
-
+cudaError_t err = cudaMalloc(&dev_ptr, size);
+if (err != cudaSuccess) {
+fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+return nullptr;
+}

-
+ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);

-return ggml_backend_buffer_init(buft,
+return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }

-static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
 return 128;

 UNUSED(buft);
 }

-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
 int64_t row_low = 0;
 int64_t row_high = ggml_nrows(tensor);
 int64_t nrows_split = row_high - row_low;
|
|
10484
10413
|
UNUSED(buft);
|
10485
10414
|
}
|
10486
10415
|
|
10487
|
-
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
10488
|
-
|
10416
|
+
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
10417
|
+
if (!ggml_backend_is_cuda(backend)) {
|
10418
|
+
return false;
|
10419
|
+
}
|
10489
10420
|
|
10490
|
-
|
10421
|
+
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
10422
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
10423
|
+
|
10424
|
+
return buft_ctx->device == cuda_ctx->device;
|
10491
10425
|
}
|
10492
10426
|
|
10493
10427
|
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
10428
|
+
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
10494
10429
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
10495
10430
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
10496
10431
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
10497
10432
|
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
10498
|
-
/* .is_host = */
|
10433
|
+
/* .is_host = */ NULL,
|
10499
10434
|
};
|
10500
10435
|
|
10501
|
-
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
10502
|
-
|
10436
|
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
10437
|
+
// FIXME: this is not thread safe
|
10438
|
+
if (device >= ggml_backend_cuda_get_device_count()) {
|
10439
|
+
return nullptr;
|
10440
|
+
}
|
10441
|
+
|
10442
|
+
static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
10503
10443
|
|
10504
10444
|
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
10505
10445
|
|
@@ -10507,7 +10447,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
 ggml_backend_cuda_buffer_types[i] = {
 /* .iface = */ ggml_backend_cuda_buffer_type_interface,
-/* .context = */
+/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
 };
 }
 ggml_backend_cuda_buffer_type_initialized = true;
@@ -10516,13 +10456,311 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
10516
10456
|
return &ggml_backend_cuda_buffer_types[device];
|
10517
10457
|
}
|
10518
10458
|
|
10459
|
+
// cuda split buffer
|
10460
|
+
|
10461
|
+
struct ggml_backend_cuda_split_buffer_context {
|
10462
|
+
~ggml_backend_cuda_split_buffer_context() {
|
10463
|
+
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
|
10464
|
+
for (int id = 0; id < g_device_count; ++id) {
|
10465
|
+
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
10466
|
+
if (extra->events[id][is] != nullptr) {
|
10467
|
+
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
10468
|
+
}
|
10469
|
+
}
|
10470
|
+
if (extra->data_device[id] != nullptr) {
|
10471
|
+
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
10472
|
+
}
|
10473
|
+
}
|
10474
|
+
delete extra;
|
10475
|
+
}
|
10476
|
+
}
|
10477
|
+
|
10478
|
+
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
10479
|
+
};
|
10480
|
+
|
10481
|
+
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
10482
|
+
return GGML_CUDA_NAME "_Split";
|
10483
|
+
|
10484
|
+
UNUSED(buffer);
|
10485
|
+
}
|
10486
|
+
|
10487
|
+
// unused at the moment
|
10488
|
+
//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
10489
|
+
// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
10490
|
+
//}
|
10491
|
+
|
10492
|
+
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
10493
|
+
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
10494
|
+
delete ctx;
|
10495
|
+
}
|
10496
|
+
|
10497
|
+
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
10498
|
+
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
10499
|
+
return (void *)0x1000;
|
10500
|
+
|
10501
|
+
UNUSED(buffer);
|
10502
|
+
}
|
10503
|
+
|
10504
|
+
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
10505
|
+
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
10506
|
+
|
10507
|
+
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
10508
|
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
10509
|
+
|
10510
|
+
const int64_t ne0 = tensor->ne[0];
|
10511
|
+
|
10512
|
+
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
10513
|
+
|
10514
|
+
ctx->tensor_extras.push_back(extra);
|
10515
|
+
|
10516
|
+
for (int id = 0; id < g_device_count; ++id) {
|
10517
|
+
int64_t row_low, row_high;
|
10518
|
+
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
10519
|
+
|
10520
|
+
int64_t nrows_split = row_high - row_low;
|
10521
|
+
if (nrows_split == 0) {
|
10522
|
+
continue;
|
10523
|
+
}
|
10524
|
+
|
10525
|
+
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
10526
|
+
const size_t original_size = size;
|
10527
|
+
|
10528
|
+
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
10529
|
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
10530
|
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
10531
|
+
}
|
10532
|
+
|
10533
|
+
// FIXME: do not crash if cudaMalloc fails
|
10534
|
+
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
10535
|
+
ggml_cuda_set_device(id);
|
10536
|
+
char * buf;
|
10537
|
+
CUDA_CHECK(cudaMalloc(&buf, size));
|
10538
|
+
|
10539
|
+
// set padding to 0 to avoid possible NaN values
|
10540
|
+
if (size > original_size) {
|
10541
|
+
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
10542
|
+
}
|
10543
|
+
|
10544
|
+
extra->data_device[id] = buf;
|
10545
|
+
|
10546
|
+
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
10547
|
+
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
10548
|
+
}
|
10549
|
+
}
|
10550
|
+
tensor->backend = GGML_BACKEND_GPU_SPLIT;
|
10551
|
+
tensor->extra = extra;
|
10552
|
+
}
+
+GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int id = 0; id < g_device_count; ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        const char * buf_host = (const char *)data + offset_split;
+        CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
+    }
+}
+
+GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int id = 0; id < g_device_count; ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        char * buf_host = (char *)data + offset_split;
+        CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
+    }
+}
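set_tensor and get_tensor above require the whole tensor at once (offset == 0, size == ggml_nbytes(tensor)) because each device owns only a band of rows: the host-side bytes for device id start at row_low*nb1 and span the unpadded slice size. A small sketch of that host-pointer arithmetic, with a hypothetical slice_for_device helper that assumes a contiguous row-major layout:

    #include <cstddef>
    #include <cstdint>
    #include <utility>

    // byte offset and unpadded byte count of the slice that device `id` owns,
    // given its row range and the tensor's row stride nb1
    static std::pair<size_t, size_t> slice_for_device(int64_t row_low, int64_t row_high, size_t nb1) {
        const size_t offset = (size_t) row_low * nb1;               // start of the first owned row
        const size_t length = (size_t) (row_high - row_low) * nb1;  // bytes actually copied for this device
        return {offset, length};
    }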
+
+GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    UNUSED(buffer);
+    UNUSED(value);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
+    /* .get_name    = */ ggml_backend_cuda_split_buffer_get_name,
+    /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
+    /* .get_base    = */ ggml_backend_cuda_split_buffer_get_base,
+    /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
+    /* .set_tensor  = */ ggml_backend_cuda_split_buffer_set_tensor,
+    /* .get_tensor  = */ ggml_backend_cuda_split_buffer_get_tensor,
+    /* .cpy_tensor  = */ NULL,
+    /* .clear       = */ ggml_backend_cuda_split_buffer_clear,
+    /* .reset       = */ NULL,
+};
+
+// cuda split buffer type
+
+GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return GGML_CUDA_NAME "_Split";
+
+    UNUSED(buft);
+}
+
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
+    // instead, we allocate them for each tensor separately in init_tensor
+    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+    ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+}
+
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    UNUSED(buft);
+}
+
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
+
+    size_t total_size = 0;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    for (int id = 0; id < g_device_count; ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        total_size += ggml_nbytes_split(tensor, nrows_split);
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return total_size;
+}
+
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
+    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
+    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
+};
+
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+    // FIXME: this is not thread safe
+    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
+
+    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
+
+    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
+    if (all_zero) {
+        tensor_split_arr = g_default_tensor_split;
+    } else {
+        float split_sum = 0.0f;
+        for (int i = 0; i < g_device_count; ++i) {
+            tensor_split_arr[i] = split_sum;
+            split_sum += tensor_split[i];
+        }
+        for (int i = 0; i < g_device_count; ++i) {
+            tensor_split_arr[i] /= split_sum;
+        }
+    }
+
+    auto it = buft_map.find(tensor_split_arr);
+    if (it != buft_map.end()) {
+        return &it->second;
+    }
+
+    struct ggml_backend_buffer_type buft {
+        /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
+        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
+    };
+
+    auto result = buft_map.emplace(tensor_split_arr, buft);
+    return &result.first->second;
+}
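The buffer type above keys its cache on the normalized, cumulative form of tensor_split: raw per-device proportions become prefix sums divided by their total, so each entry is the fraction of rows that precedes that device. For example, {3, 1} on two devices becomes {0.0, 0.75}. A standalone sketch of that normalization (kMaxDevices stands in for GGML_CUDA_MAX_DEVICES, and the all-zero fallback to g_default_tensor_split is omitted):

    #include <array>

    constexpr int kMaxDevices = 16;   // stand-in for GGML_CUDA_MAX_DEVICES in this sketch

    // prefix-sum and normalize raw per-device proportions into cumulative boundaries in [0, 1];
    // assumes at least one proportion is non-zero
    static std::array<float, kMaxDevices> normalize_split(const float * tensor_split, int n_dev) {
        std::array<float, kMaxDevices> out = {};
        float sum = 0.0f;
        for (int i = 0; i < n_dev; ++i) {
            out[i] = sum;             // fraction of rows assigned to devices before `i`
            sum   += tensor_split[i];
        }
        for (int i = 0; i < n_dev; ++i) {
            out[i] /= sum;            // e.g. {3, 1} -> {0.0, 0.75}
        }
        return out;
    }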
+
 // host buffer type
 
-static
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return GGML_CUDA_NAME "_Host";
+
+    UNUSED(buft);
+}
+
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return GGML_CUDA_NAME "_Host";
+
+    UNUSED(buffer);
+}
+
+GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_cuda_host_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     void * ptr = ggml_cuda_host_malloc(size);
 
     if (ptr == nullptr) {
@@ -10530,17 +10768,18 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
         return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
     }
 
-    // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
 }
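The host buffer type wraps pinned (cudaMallocHost-backed) memory in a CPU buffer whose name and free_buffer hooks are overridden, and silently falls back to a regular CPU buffer when pinning fails. A usage sketch, assuming the ggml-backend.h / ggml-cuda.h declarations shipped in this release:

    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    // allocate n_bytes of (ideally) pinned host memory for staging and return the host pointer;
    // the caller later releases it with ggml_backend_buffer_free(*out_buffer)
    static void * alloc_pinned_scratch(size_t n_bytes, ggml_backend_buffer_t * out_buffer) {
        ggml_backend_buffer_type_t buft = ggml_backend_cuda_host_buffer_type();
        *out_buffer = ggml_backend_buft_alloc_buffer(buft, n_bytes);  // falls back to a CPU buffer on failure
        return ggml_backend_buffer_get_base(*out_buffer);
    }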
 
-ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
         /* .iface    = */ {
+            /* .get_name       = */ ggml_backend_cuda_host_buffer_type_name,
             /* .alloc_buffer   = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
             /* .get_alignment  = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
             /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
@@ -10555,31 +10794,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
 
 // backend
 
-
-
-};
+GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
-
-    return GGML_CUDA_NAME;
-
-    UNUSED(backend);
+    return cuda_ctx->name.c_str();
 }
 
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-
+GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     delete cuda_ctx;
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     return ggml_backend_cuda_buffer_type(cuda_ctx->device);
 }
 
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-
+GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10587,8 +10822,8 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-
+GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10596,39 +10831,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static
-
-
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
-
-    UNUSED(backend);
-}
-
-static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    GGML_ASSERT(!"not implemented");
+GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
-
+    if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
+        return true;
+    }
 
-
-    UNUSED(cgraph);
+    return false;
 }
 
-static void
-
+GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
-
-    UNUSED(plan);
-}
-
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
 
     UNUSED(backend);
-    UNUSED(plan);
 }
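The async tensor copies and the new cpy_tensor_async hook all run on the backend's primary stream, so a caller only has to synchronize once after queueing work. A sketch of a host-to-device upload through the generic ggml-backend API, assuming the ggml_backend_tensor_set_async / ggml_backend_synchronize declarations from ggml-backend.h:

    #include "ggml-backend.h"

    // queue a host->device upload on the backend stream, do other work, then wait once
    static void upload_then_wait(ggml_backend_t backend, struct ggml_tensor * tensor,
                                 const void * host_data, size_t n_bytes) {
        ggml_backend_tensor_set_async(backend, tensor, host_data, 0, n_bytes);  // returns immediately
        // ... queue more copies or graph work here ...
        ggml_backend_synchronize(backend);  // block until the backend stream has drained
    }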
 
-static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-
+GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);
 
@@ -10638,57 +10861,35 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
+        }
 
-
+#ifndef NDEBUG
+        assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
         assert(node->extra != nullptr);
 
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
                 assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
                 assert(node->src[j]->extra != nullptr);
             }
         }
+#endif
 
         bool ok = ggml_cuda_compute_forward(&params, node);
         if (!ok) {
             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
-
-#if 0
-        if (node->type == GGML_TYPE_F32) {
-            cudaDeviceSynchronize();
-            std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
-            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                ggml_type_name(node->src[0]->type),
-                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                node->src[0]->name,
-                node->src[1] ? node->src[1]->name : "none");
-            double sum = 0.0;
-            double sq_sum = 0.0;
-            for (int i = 0; i < ggml_nelements(node); i++) {
-                printf("%f ", tmp[i]);
-                sum += tmp[i];
-                sq_sum += tmp[i]*tmp[i];
-            }
-            printf("\n");
-            printf("sum: %f, ", sum);
-            printf("sq_sum: %f\n", sq_sum);
-        }
-#endif
     }
 
-    UNUSED(backend);
-
     return true;
 }
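graph_compute now also skips GGML_OP_NONE nodes in addition to the view-like ops, since none of them launch kernels. A hypothetical predicate mirroring that check (written for this note, not a function in ggml-cuda.cu):

    #include "ggml.h"

    // true for graph nodes the CUDA backend treats as free: pure metadata or empty nodes
    static bool cuda_node_is_noop(const struct ggml_tensor * node) {
        switch (node->op) {
            case GGML_OP_RESHAPE:
            case GGML_OP_TRANSPOSE:
            case GGML_OP_VIEW:
            case GGML_OP_PERMUTE:
            case GGML_OP_NONE:
                return true;    // nothing to launch for these
            default:
                return false;
        }
    }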
 
-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -10799,23 +11000,22 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
     UNUSED(backend);
 }
 
-static ggml_backend_i
+static ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name                = */ ggml_backend_cuda_name,
     /* .free                    = */ ggml_backend_cuda_free,
     /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
-    /* .
-    /* .cpy_tensor_to_async     = */ NULL,
+    /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
     /* .synchronize             = */ ggml_backend_cuda_synchronize,
-    /* .graph_plan_create       = */
-    /* .graph_plan_free         = */
-    /* .graph_plan_compute      = */
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
     /* .supports_op             = */ ggml_backend_cuda_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init(int device) {
+GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
     if (device < 0 || device >= ggml_cuda_get_device_count()) {
@@ -10826,32 +11026,48 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
     // not strictly necessary, but it may reduce the overhead of the first graph_compute
     ggml_cuda_set_main_device(device);
 
-
-        /* .device = */ device
+    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
+        /* .device = */ device,
+        /* .name   = */ GGML_CUDA_NAME + std::to_string(device),
    };
 
     ggml_backend_t cuda_backend = new ggml_backend {
-        /* .interface = */
+        /* .interface = */ ggml_backend_cuda_interface,
         /* .context   = */ ctx
     };
 
     return cuda_backend;
 }
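ggml_backend_cuda_init builds the per-device context, names it GGML_CUDA_NAME plus the device index (e.g. "CUDA0"), and wires it to the interface table above. A minimal init/teardown sketch, assuming the public declarations from ggml-backend.h and ggml-cuda.h in this release:

    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main() {
        ggml_backend_t backend = ggml_backend_cuda_init(0);        // device 0
        if (backend == nullptr || !ggml_backend_is_cuda(backend)) {
            fprintf(stderr, "failed to initialize the CUDA backend\n");
            return 1;
        }
        printf("backend: %s\n", ggml_backend_name(backend));       // e.g. "CUDA0"
        ggml_backend_free(backend);
        return 0;
    }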
 
-bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend->iface.get_name == ggml_backend_cuda_name;
+GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend && backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+GGML_CALL int ggml_backend_cuda_get_device_count() {
+    return ggml_cuda_get_device_count();
+}
+
+GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
+    ggml_cuda_get_device_description(device, description, description_size);
+}
+
+GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
+    ggml_cuda_set_device(device);
+
+    CUDA_CHECK(cudaMemGetInfo(free, total));
 }
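The three device-query helpers above are thin wrappers over the existing ggml-cuda internals and cudaMemGetInfo. A small enumeration sketch using exactly those entry points:

    #include <cstdio>
    #include "ggml-cuda.h"

    static void print_cuda_devices() {
        const int n = ggml_backend_cuda_get_device_count();
        for (int i = 0; i < n; ++i) {
            char   desc[128];
            size_t free_mem = 0, total_mem = 0;
            ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
            ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
            printf("device %d: %s, %zu / %zu MiB free\n",
                   i, desc, free_mem / (1024*1024), total_mem / (1024*1024));
        }
    }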
 
-
+// backend registry
+GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
     ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
     return cuda_backend;
 
     UNUSED(params);
 }
 
-extern "C" int ggml_backend_cuda_reg_devices();
+extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
 
-int ggml_backend_cuda_reg_devices() {
+GGML_CALL int ggml_backend_cuda_reg_devices() {
     int device_count = ggml_cuda_get_device_count();
     //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
     for (int i = 0; i < device_count; i++) {