llama_cpp 0.12.1 → 0.12.2

@@ -8,8 +8,13 @@
  #include <limits>
  #include <stdint.h>
  #include <stdio.h>
+ #include <string>
  #include <vector>
-
+ #include <map>
+ #include <array>
+ #include "ggml-cuda.h"
+ #include "ggml.h"
+ #include "ggml-backend-impl.h"

  #if defined(GGML_USE_HIPBLAS)
  #include <hip/hip_runtime.h>
@@ -77,6 +82,7 @@
  #define cudaMemcpyKind hipMemcpyKind
  #define cudaMemset hipMemset
  #define cudaMemsetAsync hipMemsetAsync
+ #define cudaMemGetInfo hipMemGetInfo
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
  #define cudaSetDevice hipSetDevice
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -112,9 +118,7 @@

  #endif // defined(GGML_USE_HIPBLAS)

- #include "ggml-cuda.h"
- #include "ggml.h"
- #include "ggml-backend-impl.h"
+ #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

  #define CC_PASCAL 600
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
@@ -519,6 +523,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
  #define CUDA_ACC_BLOCK_SIZE 256
  #define CUDA_IM2COL_BLOCK_SIZE 256

+ #define CUDA_Q8_0_NE_ALIGN 2048
+
  // dmmv = dequantize_mul_mat_vec
  #ifndef GGML_CUDA_DMMV_X
  #define GGML_CUDA_DMMV_X 32
@@ -562,7 +568,7 @@ static void ggml_cuda_set_device(const int device) {

  static int g_device_count = -1;
  static int g_main_device = 0;
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+ static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};

  struct cuda_device_capabilities {
  int cc; // compute capability
@@ -573,10 +579,6 @@ struct cuda_device_capabilities {

  static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };

- static void * g_scratch_buffer = nullptr;
- static size_t g_scratch_size = 0; // disabled by default
- static size_t g_scratch_offset = 0;
-
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

  [[noreturn]]
@@ -605,16 +607,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
  }

  static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
- #if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
- (void) a;
- bad_arch();
- #else
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
  }
  return a;
- #endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ #else
+ (void) a;
+ bad_arch();
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
  }

  static __device__ __forceinline__ float warp_reduce_max(float x) {
@@ -626,16 +628,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
  }

  static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
- #if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
- (void) x;
- bad_arch();
- #else
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
  }
  return x;
- #endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ #else
+ (void) x;
+ bad_arch();
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
  }

  static __device__ __forceinline__ float op_repeat(const float a, const float b) {
@@ -1103,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
  #endif // GGML_CUDA_F16
  }

+ template<typename dst_t>
+ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+ const int i = blockIdx.x;
+
+ // assume 32 threads
+ const int tid = threadIdx.x;
+ const int il = tid/8;
+ const int ir = tid%8;
+ const int ib = 8*i + ir;
+ if (ib >= nb32) {
+ return;
+ }
+
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+ const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+ const float d = __half2float(x->d);
+ const float dm = -8*d;
+
+ const uint8_t * q = x->qs + 4*il;
+
+ for (int l = 0; l < 4; ++l) {
+ y[l+ 0] = d * (q[l] & 0xF) + dm;
+ y[l+16] = d * (q[l] >> 4) + dm;
+ }
+ }
+
+ template<typename dst_t>
+ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+ const int i = blockIdx.x;
+
+ // assume 32 threads
+ const int tid = threadIdx.x;
+ const int il = tid/8;
+ const int ir = tid%8;
+ const int ib = 8*i + ir;
+ if (ib >= nb32) {
+ return;
+ }
+
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+ const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+ const float2 d = __half22float2(x->dm);
+
+ const uint8_t * q = x->qs + 4*il;
+
+ for (int l = 0; l < 4; ++l) {
+ y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
+ y[l+16] = d.x * (q[l] >> 4) + d.y;
+ }
+ }
+
  //================================== k-quants

  template<typename dst_t>
@@ -2327,6 +2384,45 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
  y[i] = x[i];
  }

+ template <bool need_check>
+ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+ #if __CUDA_ARCH__ >= CC_PASCAL
+ constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
+
+ const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
+ const int * x0 = ((int *) vx) + blockIdx.x * nint;
+ half2 * y2 = (half2 *) (y + i0);
+
+ __shared__ int vals[nint];
+
+ #pragma unroll
+ for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
+ if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
+ break;
+ }
+
+ const int ix = ix0 + threadIdx.x;
+ vals[ix] = x0[ix];
+ }
+
+ #pragma unroll
+ for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
+ if (need_check && i0 + iy + 2*threadIdx.x >= k) {
+ return;
+ }
+
+ const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
+ const half d = *b0;
+ const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
+
+ y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
+ }
+ #else
+ (void) vx; (void) y; (void) k;
+ bad_arch();
+ #endif // __CUDA_ARCH__ >= CC_PASCAL
+ }
+
  // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
  // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

@@ -5613,7 +5709,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

  template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
  static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
  const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
  const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;

@@ -5738,7 +5834,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
  #else
  (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
  bad_arch();
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
  }

  template <bool vals_smem, int ncols_template, int block_size_template>
@@ -6181,6 +6277,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri
  dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

+ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
+ if (k % CUDA_Q8_0_NE_ALIGN == 0) {
+ const bool need_check = false;
+ dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+ } else {
+ const bool need_check = true;
+ dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+ }
+ }
+
  template<typename dst_t>
  static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
@@ -6201,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
  #endif
  }

+ template<typename dst_t>
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ const int nb32 = k / 32;
+ const int nb = (k + 255) / 256;
+ dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
+ }
+
+ template<typename dst_t>
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ const int nb32 = k / 32;
+ const int nb = (k + 255) / 256;
+ dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
+ }
+
  template<typename dst_t>
  static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
@@ -6246,16 +6367,21 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
  }

  static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+ int id;
  switch (type) {
  case GGML_TYPE_Q4_0:
- return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ return dequantize_row_q4_0_cuda;
  case GGML_TYPE_Q4_1:
- return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ return dequantize_row_q4_1_cuda;
  case GGML_TYPE_Q5_0:
  return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
  case GGML_TYPE_Q5_1:
  return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
  case GGML_TYPE_Q8_0:
+ CUDA_CHECK(cudaGetDevice(&id));
+ if (g_device_caps[id].cc >= CC_PASCAL) {
+ return dequantize_block_q8_0_f16_cuda;
+ }
  return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
  case GGML_TYPE_Q2_K:
  return dequantize_row_q2_K_cuda;
@@ -6281,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  switch (type) {
  case GGML_TYPE_Q4_0:
- return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ return dequantize_row_q4_0_cuda;
  case GGML_TYPE_Q4_1:
- return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ return dequantize_row_q4_1_cuda;
  case GGML_TYPE_Q5_0:
  return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
  case GGML_TYPE_Q5_1:
@@ -7489,11 +7615,11 @@ struct cuda_pool_alloc {

  static bool g_cublas_loaded = false;

- bool ggml_cublas_loaded(void) {
+ GGML_CALL bool ggml_cublas_loaded(void) {
  return g_cublas_loaded;
  }

- void ggml_init_cublas() {
+ GGML_CALL void ggml_init_cublas() {
  static bool initialized = false;

  if (!initialized) {
@@ -7546,8 +7672,9 @@ void ggml_init_cublas() {
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
  fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

- g_tensor_split[id] = total_vram;
+ g_default_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;
+
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
  #else
@@ -7556,7 +7683,7 @@ void ggml_init_cublas() {
  g_device_caps[id].smpb = prop.sharedMemPerBlock;
  }
  for (int id = 0; id < g_device_count; ++id) {
- g_tensor_split[id] /= total_vram;
+ g_default_tensor_split[id] /= total_vram;
  }

  for (int id = 0; id < g_device_count; ++id) {
@@ -7580,31 +7707,7 @@ void ggml_init_cublas() {
  }
  }

- void ggml_cuda_set_tensor_split(const float * tensor_split) {
- if (tensor_split == nullptr) {
- return;
- }
- bool all_zero = true;
- for (int i = 0; i < g_device_count; ++i) {
- if (tensor_split[i] != 0.0f) {
- all_zero = false;
- break;
- }
- }
- if (all_zero) {
- return;
- }
- float split_sum = 0.0f;
- for (int i = 0; i < g_device_count; ++i) {
- g_tensor_split[i] = split_sum;
- split_sum += tensor_split[i];
- }
- for (int i = 0; i < g_device_count; ++i) {
- g_tensor_split[i] /= split_sum;
- }
- }
-
- void * ggml_cuda_host_malloc(size_t size) {
+ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
  if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
  return nullptr;
  }
@@ -7622,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
  return ptr;
  }

- void ggml_cuda_host_free(void * ptr) {
+ GGML_CALL void ggml_cuda_host_free(void * ptr) {
  CUDA_CHECK(cudaFreeHost(ptr));
  }

@@ -8055,11 +8158,11 @@ static void ggml_cuda_op_mul_mat_q(
  (void) src1_ddf_i;
  }

- static int64_t get_row_rounding(ggml_type type) {
+ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
  int64_t min_compute_capability = INT_MAX;
  int64_t max_compute_capability = INT_MIN;
  for (int id = 0; id < g_device_count; ++id) {
- if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
  if (min_compute_capability > g_device_caps[id].cc) {
  min_compute_capability = g_device_caps[id].cc;
  }
@@ -8120,6 +8223,21 @@ static int64_t get_row_rounding(ggml_type type) {
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  }

+ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
+ const int64_t nrows = ggml_nrows(tensor);
+ const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+
+ *row_low = id == 0 ? 0 : nrows*tensor_split[id];
+ *row_low -= *row_low % rounding;
+
+ if (id == g_device_count - 1) {
+ *row_high = nrows;
+ } else {
+ *row_high = nrows*tensor_split[id + 1];
+ *row_high -= *row_high % rounding;
+ }
+ }
+
  static void ggml_cuda_op_mul_mat_vec_q(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -8574,15 +8692,15 @@ static void ggml_cuda_op_soft_max(
  float scale = 1.0f;
  memcpy(&scale, dst->op_params, sizeof(float));

- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- const bool use_f16_soft_max = false;
- #else
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
  #ifdef GGML_CUDA_F16
  const bool use_f16_soft_max = true;
  #else
  const bool use_f16_soft_max = false;
  #endif // GGML_CUDA_F16
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #else
+ const bool use_f16_soft_max = false;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX

  if (use_f16_soft_max) {
  soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
@@ -8737,6 +8855,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
  peer_access_enabled = enable_peer_access;
  }

+ // FIXME: move this somewhere else
+ struct ggml_backend_cuda_split_buffer_type_context {
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+ };
+
  static void ggml_cuda_op_mul_mat(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
  const bool convert_src1_to_q8_1) {
@@ -8788,6 +8911,14 @@ static void ggml_cuda_op_mul_mat(
  GGML_ASSERT(!(split && ne03 > 1));
  GGML_ASSERT(!(split && ne02 < ne12));

+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+ if (split) {
+ // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
+ // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
+ tensor_split = buft_ctx->tensor_split;
+ }
+
  struct dev_data {
  cuda_pool_alloc<char> src0_dd_alloc;
  cuda_pool_alloc<float> src1_ddf_alloc;
@@ -8815,17 +8946,17 @@ static void ggml_cuda_op_mul_mat(
  // for multi GPU, get the row boundaries from tensor split
  // and round to mul_mat_q tile sizes
  if (split) {
- const int64_t rounding = get_row_rounding(src0->type);
+ const int64_t rounding = get_row_rounding(src0->type, tensor_split);

  if (id != 0) {
- dev[id].row_low = ne01*g_tensor_split[id];
+ dev[id].row_low = ne01*tensor_split[id];
  if (dev[id].row_low < ne01) {
  dev[id].row_low -= dev[id].row_low % rounding;
  }
  }

  if (id != g_device_count - 1) {
- dev[id].row_high = ne01*g_tensor_split[id + 1];
+ dev[id].row_high = ne01*tensor_split[id + 1];
  if (dev[id].row_high < ne01) {
  dev[id].row_high -= dev[id].row_high % rounding;
  }
@@ -9111,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
  }

- bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
  if (!g_cublas_loaded) return false;

  const int64_t ne10 = src1->ne[0];
@@ -9371,10 +9502,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;

  int64_t min_compute_capability = INT_MAX;
- for (int id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
- min_compute_capability = g_device_caps[id].cc;
+
+ if (split) {
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
+ auto & tensor_split = buft_ctx->tensor_split;
+ for (int id = 0; id < g_device_count; ++id) {
+ if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+ min_compute_capability = g_device_caps[id].cc;
+ }
  }
+ } else {
+ min_compute_capability = g_device_caps[g_main_device].cc;
  }

  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@@ -9413,7 +9551,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
  // KQV single-batch
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+ } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
  // KQ + KQV multi-batch
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
  } else if (src0->type == GGML_TYPE_F32) {
@@ -9875,297 +10013,39 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
  return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
  }

- void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
- const int64_t nrows = ggml_nrows(tensor);
-
- const int64_t ne0 = tensor->ne[0];
-
- const size_t nb1 = tensor->nb[1];
-
- ggml_backend_type backend = tensor->backend;
- ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
-
- for (int id = 0; id < g_device_count; ++id) {
- if (backend == GGML_BACKEND_GPU && id != g_main_device) {
- continue;
- }
-
- ggml_cuda_set_device(id);
-
- int64_t row_low, row_high;
- if (backend == GGML_BACKEND_GPU) {
- row_low = 0;
- row_high = nrows;
- } else if (backend == GGML_BACKEND_GPU_SPLIT) {
- const int64_t rounding = get_row_rounding(tensor->type);
-
- row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
- row_low -= row_low % rounding;
-
- if (id == g_device_count - 1) {
- row_high = nrows;
- } else {
- row_high = nrows*g_tensor_split[id + 1];
- row_high -= row_high % rounding;
- }
- } else {
- GGML_ASSERT(false);
- }
- if (row_low == row_high) {
- continue;
- }
-
- int64_t nrows_split = row_high - row_low;
-
- const size_t offset_split = row_low*nb1;
- size_t size = ggml_nbytes_split(tensor, nrows_split);
- const size_t original_size = size;
-
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
- if (ne0 % MATRIX_ROW_PADDING != 0) {
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
- }
-
- char * buf;
- CUDA_CHECK(cudaMalloc(&buf, size));
- char * buf_host = (char *)data + offset_split;
-
- // set padding to 0 to avoid possible NaN values
- if (size > original_size) {
- CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
- }
-
- CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
-
- extra->data_device[id] = buf;
-
- if (backend == GGML_BACKEND_GPU_SPLIT) {
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
- }
- }
- }
-
- tensor->extra = extra;
- }
-
- void ggml_cuda_free_data(struct ggml_tensor * tensor) {
- if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
+ if (main_device >= g_device_count) {
+ fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
+ main_device, g_device_count, g_main_device);
  return;
  }

- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
- for (int id = 0; id < g_device_count; ++id) {
- ggml_cuda_set_device(id);
- if (extra->data_device[id] != nullptr) {
- CUDA_CHECK(cudaFree(extra->data_device[id]));
- }
-
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
- if (extra->events[id][is] != nullptr) {
- CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
- }
- }
- }
-
- delete extra;
- }
-
- static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
- static size_t g_temp_tensor_extra_index = 0;
-
- static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
- if (g_temp_tensor_extras == nullptr) {
- g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+ if (g_main_device != main_device && g_device_count > 1) {
+ g_main_device = main_device;
+ //cudaDeviceProp prop;
+ //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
+ //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
  }
-
- size_t alloc_index = g_temp_tensor_extra_index;
- g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
- ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
- memset(extra, 0, sizeof(*extra));
-
- return extra;
  }

- static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
- if (scratch && g_scratch_size == 0) {
- return;
- }
-
- tensor->backend = GGML_BACKEND_GPU;
-
- // recursively assign CUDA buffers until a compute tensor is found
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
- const ggml_op src0_op = tensor->src[0]->op;
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
- }
- }
- if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
- }
-
- if (scratch && no_alloc) {
- return;
- }
-
- ggml_tensor_extra_gpu * extra;
-
- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
- tensor->op == GGML_OP_VIEW ||
- force_inplace;
- const size_t size = ggml_nbytes(tensor);
-
- ggml_cuda_set_device(g_main_device);
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
- size_t offset = 0;
- if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->op_params, sizeof(size_t));
- }
- extra = ggml_cuda_alloc_temp_tensor_extra();
- extra->data_device[g_main_device] = src0_ddc + offset;
- } else if (tensor->op == GGML_OP_CPY) {
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
- void * src1_ddv = src1_extra->data_device[g_main_device];
- extra = ggml_cuda_alloc_temp_tensor_extra();
- extra->data_device[g_main_device] = src1_ddv;
- } else if (scratch) {
- GGML_ASSERT(size <= g_scratch_size);
- if (g_scratch_offset + size > g_scratch_size) {
- g_scratch_offset = 0;
- }
-
- char * data = (char *) g_scratch_buffer;
- if (data == nullptr) {
- CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
- g_scratch_buffer = data;
- }
- extra = ggml_cuda_alloc_temp_tensor_extra();
- extra->data_device[g_main_device] = data + g_scratch_offset;
-
- g_scratch_offset += size;
-
- GGML_ASSERT(g_scratch_offset <= g_scratch_size);
- } else { // allocate new buffers outside of scratch
- void * data;
- CUDA_CHECK(cudaMalloc(&data, size));
- CUDA_CHECK(cudaMemset(data, 0, size));
- extra = new ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
- extra->data_device[g_main_device] = data;
- }
+ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+ if (!g_cublas_loaded) return false;

- tensor->extra = extra;
- }
+ ggml_cuda_func_t func;
+ const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

- void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
- if (g_scratch_size == 0) {
- return;
- }
- if (g_scratch_buffer == nullptr) {
- ggml_cuda_set_device(g_main_device);
- CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
+ return false;
  }

- ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
-
- const bool inplace = tensor->view_src != nullptr;
-
- if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
- size_t view_offset = 0;
- if (tensor->op == GGML_OP_VIEW) {
- memcpy(&view_offset, tensor->op_params, sizeof(size_t));
- }
- extra->data_device[g_main_device] = src0_ddc + view_offset;
- } else {
- extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
- }
-
- tensor->extra = extra;
- }
-
- void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
- GGML_ASSERT(ggml_is_contiguous(tensor));
-
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
- ggml_cuda_set_device(g_main_device);
- CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
- }
-
- void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, true, false, false);
- }
-
- void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, true, false, true);
- }
-
- void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, false, false);
- }
-
- void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, true, false);
- }
-
- void ggml_cuda_set_main_device(const int main_device) {
- if (main_device >= g_device_count) {
- fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
- main_device, g_device_count, g_main_device);
- return;
- }
-
- if (g_main_device != main_device && g_device_count > 1) {
- g_main_device = main_device;
- cudaDeviceProp prop;
- CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
- fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
- }
- }
-
- void ggml_cuda_set_scratch_size(const size_t scratch_size) {
- // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
- // it still won't always work as expected, but it's better than nothing
- if (scratch_size > g_scratch_size) {
- ggml_cuda_free_scratch();
- }
- g_scratch_size = std::max(g_scratch_size, scratch_size);
- }
-
- void ggml_cuda_free_scratch() {
- if (g_scratch_buffer == nullptr) {
- return;
- }
-
- CUDA_CHECK(cudaFree(g_scratch_buffer));
- g_scratch_buffer = nullptr;
- }
-
- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
- if (!g_cublas_loaded) return false;
-
- ggml_cuda_func_t func;
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
-
- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
- return false;
- }
-
- if (tensor->op == GGML_OP_MUL_MAT) {
- if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
- #ifndef NDEBUG
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
- #endif
- return false;
+ if (tensor->op == GGML_OP_MUL_MAT) {
+ if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ #endif
+ return false;
  }
  }

@@ -10306,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  return true;
  }

- int ggml_cuda_get_device_count() {
+ GGML_CALL int ggml_cuda_get_device_count() {
  int device_count;
  if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
  return 0;
@@ -10314,7 +10194,7 @@ int ggml_cuda_get_device_count() {
  return device_count;
  }

- void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+ GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
  snprintf(description, description_size, "%s", prop.name);
@@ -10326,21 +10206,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des

  #define UNUSED GGML_UNUSED

+ struct ggml_backend_cuda_context {
+ int device;
+ std::string name;
+ };
+
  // cuda buffer

- struct ggml_backend_buffer_context_cuda {
+ struct ggml_backend_cuda_buffer_context {
  int device;
  void * dev_ptr = nullptr;
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
  size_t temp_tensor_extra_index = 0;
+ std::string name;

- ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+ ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
+ device(device), dev_ptr(dev_ptr),
+ name(GGML_CUDA_NAME + std::to_string(device)) {
+ }

- ~ggml_backend_buffer_context_cuda() {
+ ~ggml_backend_cuda_buffer_context() {
  delete[] temp_tensor_extras;
  }

  ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
  if (temp_tensor_extras == nullptr) {
  temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
  }
@@ -10354,19 +10244,28 @@ struct ggml_backend_buffer_context_cuda {
  }
  };

- static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+ return ctx->name.c_str();
+ }
+
+ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+ return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
+ }
+
+ GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
  CUDA_CHECK(cudaFree(ctx->dev_ptr));
  delete ctx;
  }

- static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
  return ctx->dev_ptr;
  }

- static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

  if (tensor->view_src != NULL && tensor->view_offs == 0) {
  assert(tensor->view_src->buffer->buft == buffer->buft);
@@ -10395,14 +10294,12 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
  CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
  }
  }
-
- UNUSED(buffer);
  }

- static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

  ggml_cuda_set_device(ctx->device);
  CUDA_CHECK(cudaDeviceSynchronize());
@@ -10410,61 +10307,93 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
  CUDA_CHECK(cudaDeviceSynchronize());
  }

- static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

  ggml_cuda_set_device(ctx->device);
  CUDA_CHECK(cudaDeviceSynchronize());
-
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaDeviceSynchronize());
  }

- static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+ if (ggml_backend_buffer_is_cuda(src->buffer)) {
+ ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+ ggml_cuda_set_device(src_ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+ ggml_cuda_set_device(dst_ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+ CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaDeviceSynchronize());
+
+ return true;
+ }
+ return false;
+ }
+
+ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

  ggml_cuda_set_device(ctx->device);
  CUDA_CHECK(cudaDeviceSynchronize());
-
  CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
+ CUDA_CHECK(cudaDeviceSynchronize());
  }

- static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
+ /* .get_name = */ ggml_backend_cuda_buffer_get_name,
  /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
  /* .get_base = */ ggml_backend_cuda_buffer_get_base,
  /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
  /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
- /* .cpy_tensor_from = */ NULL,
- /* .cpy_tensor_to = */ NULL,
+ /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
  /* .clear = */ ggml_backend_cuda_buffer_clear,
+ /* .reset = */ NULL,
  };

  // cuda buffer type
+ struct ggml_backend_cuda_buffer_type_context {
+ int device;
+ std::string name;
+ };

- static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- int device = (int) (intptr_t) buft->context;
+ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
+ ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

- ggml_cuda_set_device(device);
+ return ctx->name.c_str();
+ }
+
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+ ggml_cuda_set_device(buft_ctx->device);

  size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

  void * dev_ptr;
- CUDA_CHECK(cudaMalloc(&dev_ptr, size));
+ cudaError_t err = cudaMalloc(&dev_ptr, size);
+ if (err != cudaSuccess) {
+ fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+ return nullptr;
+ }

- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+ ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);

- return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
  }

- static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
  return 128;

  UNUSED(buft);
  }

- static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
  int64_t row_low = 0;
  int64_t row_high = ggml_nrows(tensor);
  int64_t nrows_split = row_high - row_low;
@@ -10484,22 +10413,33 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
  UNUSED(buft);
  }

- static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_cuda(backend);
+ GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+ if (!ggml_backend_is_cuda(backend)) {
+ return false;
+ }

- UNUSED(buft);
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+ return buft_ctx->device == cuda_ctx->device;
  }

  static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
+ /* .get_name = */ ggml_backend_cuda_buffer_type_name,
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
- /* .is_host = */ nullptr,
+ /* .is_host = */ NULL,
  };

- ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
- static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+ // FIXME: this is not thread safe
+ if (device >= ggml_backend_cuda_get_device_count()) {
+ return nullptr;
+ }
+
+ static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];

  static bool ggml_backend_cuda_buffer_type_initialized = false;

@@ -10507,7 +10447,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
  ggml_backend_cuda_buffer_types[i] = {
  /* .iface = */ ggml_backend_cuda_buffer_type_interface,
- /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+ /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
  };
  }
  ggml_backend_cuda_buffer_type_initialized = true;
@@ -10516,13 +10456,311 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
  return &ggml_backend_cuda_buffer_types[device];
  }

+ // cuda split buffer
+
+ struct ggml_backend_cuda_split_buffer_context {
+ ~ggml_backend_cuda_split_buffer_context() {
+ for (ggml_tensor_extra_gpu * extra : tensor_extras) {
+ for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ if (extra->events[id][is] != nullptr) {
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+ }
+ }
+ if (extra->data_device[id] != nullptr) {
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
+ }
+ }
+ delete extra;
+ }
+ }
+
+ std::vector<ggml_tensor_extra_gpu *> tensor_extras;
+ };
+
+ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
+ return GGML_CUDA_NAME "_Split";
+
+ UNUSED(buffer);
+ }
+
+ // unused at the moment
+ //static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
+ // return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
+ //}
+
+ GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
+ delete ctx;
+ }
+
+ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+ // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
+ return (void *)0x1000;
+
+ UNUSED(buffer);
+ }
+
+ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+ const int64_t ne0 = tensor->ne[0];
+
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
+
+ ctx->tensor_extras.push_back(extra);
+
+ for (int id = 0; id < g_device_count; ++id) {
+ int64_t row_low, row_high;
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+ int64_t nrows_split = row_high - row_low;
+ if (nrows_split == 0) {
+ continue;
+ }
+
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ }
+
+ // FIXME: do not crash if cudaMalloc fails
+ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
+ ggml_cuda_set_device(id);
+ char * buf;
+ CUDA_CHECK(cudaMalloc(&buf, size));
+
+ // set padding to 0 to avoid possible NaN values
+ if (size > original_size) {
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+ }
+
+ extra->data_device[id] = buf;
+
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+ }
+ }
+ tensor->backend = GGML_BACKEND_GPU_SPLIT;
+ tensor->extra = extra;
+ }
+
+ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ // split tensors must always be set in their entirety at once
+ GGML_ASSERT(offset == 0);
+ GGML_ASSERT(size == ggml_nbytes(tensor));
+
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+ const int64_t ne0 = tensor->ne[0];
+ const size_t nb1 = tensor->nb[1];
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+ for (int id = 0; id < g_device_count; ++id) {
+ int64_t row_low, row_high;
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+ int64_t nrows_split = row_high - row_low;
+ if (nrows_split == 0) {
+ continue;
+ }
+
+ const size_t offset_split = row_low*nb1;
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ }
+
+ const char * buf_host = (const char *)data + offset_split;
+ CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
+ }
+ }
+
+ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ // split tensors must always be set in their entirety at once
+ GGML_ASSERT(offset == 0);
+ GGML_ASSERT(size == ggml_nbytes(tensor));
+
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+ const int64_t ne0 = tensor->ne[0];
+ const size_t nb1 = tensor->nb[1];
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+ for (int id = 0; id < g_device_count; ++id) {
+ int64_t row_low, row_high;
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+ int64_t nrows_split = row_high - row_low;
+ if (nrows_split == 0) {
+ continue;
+ }
+
+ const size_t offset_split = row_low*nb1;
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ }
+
+ char * buf_host = (char *)data + offset_split;
+ CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
+ }
+ }
+
+ GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ UNUSED(buffer);
+ UNUSED(value);
+ }
+
+ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
+ /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
+ /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
+ /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
+ /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
+ /* .cpy_tensor = */ NULL,
+ /* .clear = */ ggml_backend_cuda_split_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ // cuda split buffer type
+
+ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
+ return GGML_CUDA_NAME "_Split";
+
+ UNUSED(buft);
+ }
+
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
+ // instead, we allocate them for each tensor separately in init_tensor
+ // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+ // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+ ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
+
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+ }
+
+ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ return 128;
+
+ UNUSED(buft);
+ }
+
+ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
10665
+
10666
+ size_t total_size = 0;
10667
+
10668
+ const int64_t ne0 = tensor->ne[0];
10669
+
10670
+ for (int id = 0; id < g_device_count; ++id) {
10671
+ int64_t row_low, row_high;
10672
+ get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
10673
+
10674
+ int64_t nrows_split = row_high - row_low;
10675
+ if (nrows_split == 0) {
10676
+ continue;
10677
+ }
10678
+
10679
+ total_size += ggml_nbytes_split(tensor, nrows_split);
10680
+
10681
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10682
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10683
+ total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10684
+ }
10685
+ }
10686
+
10687
+ return total_size;
10688
+ }
10689
+
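As a rough intuition for get_alloc_size: every device that receives rows contributes one padded row tail, so the reported size can slightly exceed ggml_nbytes for the tensor. A hedged fp32 back-of-the-envelope:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne0   = 5000;                     // not a multiple of 512
        const int64_t nrows = 4096;
        const int     n_devices_with_rows = 2;
        const size_t  elt   = sizeof(float);

        const size_t nbytes  = (size_t)(ne0*nrows)*elt;                // what ggml_nbytes reports
        const size_t pad_row = (size_t)((512 - ne0 % 512) % 512)*elt;  // one padded row tail per device
        const size_t alloc   = nbytes + (size_t)n_devices_with_rows*pad_row;

        printf("tensor bytes: %zu, reported alloc size: %zu (+%zu)\n", nbytes, alloc, alloc - nbytes);
        return 0;
    }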
10690
+ GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10691
+ return ggml_backend_is_cuda(backend);
10692
+
10693
+ UNUSED(buft);
10694
+ }
10695
+
10696
+ GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
10697
+ return false;
10698
+
10699
+ UNUSED(buft);
10700
+ }
10701
+
10702
+ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
10703
+ /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
10704
+ /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
10705
+ /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
10706
+ /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
10707
+ /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
10708
+ /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
10709
+ };
10710
+
10711
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
10712
+ // FIXME: this is not thread safe
10713
+ static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
10714
+
10715
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
10716
+
10717
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
10718
+ if (all_zero) {
10719
+ tensor_split_arr = g_default_tensor_split;
10720
+ } else {
10721
+ float split_sum = 0.0f;
10722
+ for (int i = 0; i < g_device_count; ++i) {
10723
+ tensor_split_arr[i] = split_sum;
10724
+ split_sum += tensor_split[i];
10725
+ }
10726
+ for (int i = 0; i < g_device_count; ++i) {
10727
+ tensor_split_arr[i] /= split_sum;
10728
+ }
10729
+ }
10730
+
10731
+ auto it = buft_map.find(tensor_split_arr);
10732
+ if (it != buft_map.end()) {
10733
+ return &it->second;
10734
+ }
10735
+
10736
+ struct ggml_backend_buffer_type buft {
10737
+ /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
10738
+ /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
10739
+ };
10740
+
10741
+ auto result = buft_map.emplace(tensor_split_arr, buft);
10742
+ return &result.first->second;
10743
+ }
10744
+
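The normalization above turns user-supplied per-device proportions into cumulative start fractions, and that cumulative array then doubles as the key of the buft_map cache, so repeated calls with the same proportions return the same buffer type. A hedged, standalone sketch of that loop (array length shortened to 2 instead of GGML_CUDA_MAX_DEVICES):

    #include <array>
    #include <cstdio>

    int main() {
        const float tensor_split[2] = {3.0f, 1.0f};   // caller asks for a 75/25 split
        std::array<float, 2> cumulative = {};

        float split_sum = 0.0f;
        for (int i = 0; i < 2; ++i) {
            cumulative[i] = split_sum;                // start fraction of device i (unnormalized)
            split_sum += tensor_split[i];
        }
        for (int i = 0; i < 2; ++i) {
            cumulative[i] /= split_sum;               // -> {0.00f, 0.75f}
        }

        printf("device starts: %.2f %.2f\n", cumulative[0], cumulative[1]);
        return 0;
    }

A caller passes the raw proportions (e.g. {3, 1}) to ggml_backend_cuda_split_buffer_type and the function performs this normalization internally; a null or all-zero split falls back to g_default_tensor_split.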
10519
10745
  // host buffer type
10520
10746
 
10521
- static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10747
+ GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
10748
+ return GGML_CUDA_NAME "_Host";
10749
+
10750
+ UNUSED(buft);
10751
+ }
10752
+
10753
+ GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
10754
+ return GGML_CUDA_NAME "_Host";
10755
+
10756
+ UNUSED(buffer);
10757
+ }
10758
+
10759
+ GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10522
10760
  ggml_cuda_host_free(buffer->context);
10523
10761
  }
10524
10762
 
10525
- static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10763
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10526
10764
  void * ptr = ggml_cuda_host_malloc(size);
10527
10765
 
10528
10766
  if (ptr == nullptr) {
@@ -10530,17 +10768,18 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
10530
10768
  return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
10531
10769
  }
10532
10770
 
10533
- // FIXME: this is a hack to avoid having to implement a new buffer type
10534
10771
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
10535
10772
  buffer->buft = buft;
10773
+ buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
10536
10774
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
10537
10775
 
10538
10776
  return buffer;
10539
10777
  }
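A hedged usage sketch of the pinned host buffer type: the generic buffer helpers (ggml_backend_buft_alloc_buffer, ggml_backend_buffer_get_base, ggml_backend_buffer_get_size, ggml_backend_buffer_free) are assumed to come from ggml-backend.h, and the allocation silently falls back to a plain CPU buffer when pinned allocation fails, as the code above shows.

    #include "ggml-cuda.h"
    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        // request 16 MiB of page-locked host memory for fast H2D/D2H transfers
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(ggml_backend_cuda_host_buffer_type(), 16*1024*1024);
        void * base = ggml_backend_buffer_get_base(buf);   // pinned, or plain CPU memory on fallback
        printf("host buffer at %p, size %zu\n", base, ggml_backend_buffer_get_size(buf));
        ggml_backend_buffer_free(buf);
        return 0;
    }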
10540
10778
 
10541
- ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10779
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10542
10780
  static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
10543
10781
  /* .iface = */ {
10782
+ /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
10544
10783
  /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
10545
10784
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
10546
10785
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
@@ -10555,31 +10794,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10555
10794
 
10556
10795
  // backend
10557
10796
 
10558
- struct ggml_backend_context_cuda {
10559
- int device;
10560
- };
10797
+ GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
10798
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10561
10799
 
10562
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
10563
- return GGML_CUDA_NAME;
10564
-
10565
- UNUSED(backend);
10800
+ return cuda_ctx->name.c_str();
10566
10801
  }
10567
10802
 
10568
- static void ggml_backend_cuda_free(ggml_backend_t backend) {
10569
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10803
+ GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
10804
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10570
10805
 
10571
10806
  delete cuda_ctx;
10572
10807
  delete backend;
10573
10808
  }
10574
10809
 
10575
- static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
10576
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10810
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
10811
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10577
10812
 
10578
10813
  return ggml_backend_cuda_buffer_type(cuda_ctx->device);
10579
10814
  }
10580
10815
 
10581
- static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10582
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10816
+ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10817
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10583
10818
 
10584
10819
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
10585
10820
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10587,8 +10822,8 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
10587
10822
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
10588
10823
  }
10589
10824
 
10590
- static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10591
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10825
+ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10826
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10592
10827
 
10593
10828
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
10594
10829
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10596,39 +10831,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
10596
10831
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
10597
10832
  }
10598
10833
 
10599
- static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
10600
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10601
-
10602
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
10603
-
10604
- UNUSED(backend);
10605
- }
10606
-
10607
- static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
10608
- GGML_ASSERT(!"not implemented");
10834
+ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
10835
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10609
10836
 
10610
- return nullptr;
10837
+ if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
10838
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
10839
+ return true;
10840
+ }
10611
10841
 
10612
- UNUSED(backend);
10613
- UNUSED(cgraph);
10842
+ return false;
10614
10843
  }
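The new cpy_tensor_async hook lets the backend claim a device-to-device copy and return false when it cannot. Below is a hedged sketch of how a caller with access to ggml-backend-impl.h might wrap it with a host bounce-buffer fallback; copy_tensor_with_fallback is an illustrative name, ggml_backend_tensor_get/set come from the public ggml-backend API, and the includes assumed are the ones this file already pulls in (ggml.h, ggml-backend-impl.h, <vector>, <stdint.h>).

    static void copy_tensor_with_fallback(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
        if (backend->iface.cpy_tensor_async != NULL && backend->iface.cpy_tensor_async(backend, src, dst)) {
            return; // the device-to-device copy was queued on the backend's stream
        }
        // slow path: bounce through a temporary host buffer (src and dst are assumed to have equal nbytes)
        std::vector<uint8_t> tmp(ggml_nbytes(src));
        ggml_backend_tensor_get(src, tmp.data(), 0, tmp.size());
        ggml_backend_tensor_set(dst, tmp.data(), 0, tmp.size());
    }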
10615
10844
 
10616
- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
10617
- GGML_ASSERT(!"not implemented");
10845
+ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
10846
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10618
10847
 
10619
- UNUSED(backend);
10620
- UNUSED(plan);
10621
- }
10622
-
10623
- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
10624
- GGML_ASSERT(!"not implemented");
10848
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
10625
10849
 
10626
10850
  UNUSED(backend);
10627
- UNUSED(plan);
10628
10851
  }
10629
10852
 
10630
- static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
10631
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10853
+ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
10854
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10632
10855
 
10633
10856
  ggml_cuda_set_main_device(cuda_ctx->device);
10634
10857
 
@@ -10638,57 +10861,35 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
10638
10861
  for (int i = 0; i < cgraph->n_nodes; i++) {
10639
10862
  ggml_tensor * node = cgraph->nodes[i];
10640
10863
 
10641
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
10864
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
10642
10865
  continue;
10866
+ }
10643
10867
 
10644
- assert(node->backend == GGML_BACKEND_GPU);
10868
+ #ifndef NDEBUG
10869
+ assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
10645
10870
  assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
10646
10871
  assert(node->extra != nullptr);
10647
10872
 
10648
10873
  for (int j = 0; j < GGML_MAX_SRC; j++) {
10649
10874
  if (node->src[j] != nullptr) {
10650
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
10875
+ assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
10651
10876
  assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
10652
10877
  assert(node->src[j]->extra != nullptr);
10653
10878
  }
10654
10879
  }
10880
+ #endif
10655
10881
 
10656
10882
  bool ok = ggml_cuda_compute_forward(&params, node);
10657
10883
  if (!ok) {
10658
10884
  fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
10659
10885
  }
10660
10886
  GGML_ASSERT(ok);
10661
-
10662
- #if 0
10663
- if (node->type == GGML_TYPE_F32) {
10664
- cudaDeviceSynchronize();
10665
- std::vector<float> tmp(ggml_nelements(node), 0.0f);
10666
- cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
10667
- printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
10668
- ggml_type_name(node->src[0]->type),
10669
- node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
10670
- node->src[0]->name,
10671
- node->src[1] ? node->src[1]->name : "none");
10672
- double sum = 0.0;
10673
- double sq_sum = 0.0;
10674
- for (int i = 0; i < ggml_nelements(node); i++) {
10675
- printf("%f ", tmp[i]);
10676
- sum += tmp[i];
10677
- sq_sum += tmp[i]*tmp[i];
10678
- }
10679
- printf("\n");
10680
- printf("sum: %f, ", sum);
10681
- printf("sq_sum: %f\n", sq_sum);
10682
- }
10683
- #endif
10684
10887
  }
10685
10888
 
10686
- UNUSED(backend);
10687
-
10688
10889
  return true;
10689
10890
  }
10690
10891
 
10691
- static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
10892
+ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
10692
10893
  switch (op->op) {
10693
10894
  case GGML_OP_UNARY:
10694
10895
  switch (ggml_get_unary_op(op)) {
@@ -10799,23 +11000,22 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
10799
11000
  UNUSED(backend);
10800
11001
  }
10801
11002
 
10802
- static ggml_backend_i cuda_backend_i = {
11003
+ static ggml_backend_i ggml_backend_cuda_interface = {
10803
11004
  /* .get_name = */ ggml_backend_cuda_name,
10804
11005
  /* .free = */ ggml_backend_cuda_free,
10805
11006
  /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
10806
11007
  /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
10807
11008
  /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
10808
- /* .cpy_tensor_from_async = */ NULL,
10809
- /* .cpy_tensor_to_async = */ NULL,
11009
+ /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
10810
11010
  /* .synchronize = */ ggml_backend_cuda_synchronize,
10811
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
10812
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
10813
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
11011
+ /* .graph_plan_create = */ NULL,
11012
+ /* .graph_plan_free = */ NULL,
11013
+ /* .graph_plan_compute = */ NULL,
10814
11014
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
10815
11015
  /* .supports_op = */ ggml_backend_cuda_supports_op,
10816
11016
  };
10817
11017
 
10818
- ggml_backend_t ggml_backend_cuda_init(int device) {
11018
+ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
10819
11019
  ggml_init_cublas(); // TODO: remove from ggml.c
10820
11020
 
10821
11021
  if (device < 0 || device >= ggml_cuda_get_device_count()) {
@@ -10826,32 +11026,48 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
10826
11026
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
10827
11027
  ggml_cuda_set_main_device(device);
10828
11028
 
10829
- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
10830
- /* .device = */ device
11029
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
11030
+ /* .device = */ device,
11031
+ /* .name = */ GGML_CUDA_NAME + std::to_string(device),
10831
11032
  };
10832
11033
 
10833
11034
  ggml_backend_t cuda_backend = new ggml_backend {
10834
- /* .interface = */ cuda_backend_i,
11035
+ /* .interface = */ ggml_backend_cuda_interface,
10835
11036
  /* .context = */ ctx
10836
11037
  };
10837
11038
 
10838
11039
  return cuda_backend;
10839
11040
  }
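A hedged usage sketch for the public entry points touched here: ggml_backend_cuda_init returns NULL for an out-of-range device, ggml_backend_is_cuda identifies the backend by its get_name function pointer, and ggml_backend_name/ggml_backend_free are assumed to come from ggml-backend.h.

    #include "ggml-cuda.h"
    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        ggml_backend_t backend = ggml_backend_cuda_init(0);   // device 0
        if (backend == NULL) {
            fprintf(stderr, "failed to initialize CUDA backend\n");
            return 1;
        }
        printf("backend: %s, is_cuda: %d\n", ggml_backend_name(backend), ggml_backend_is_cuda(backend));
        ggml_backend_free(backend);
        return 0;
    }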
10840
11041
 
10841
- bool ggml_backend_is_cuda(ggml_backend_t backend) {
10842
- return backend->iface.get_name == ggml_backend_cuda_name;
11042
+ GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
11043
+ return backend && backend->iface.get_name == ggml_backend_cuda_name;
11044
+ }
11045
+
11046
+ GGML_CALL int ggml_backend_cuda_get_device_count() {
11047
+ return ggml_cuda_get_device_count();
11048
+ }
11049
+
11050
+ GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
11051
+ ggml_cuda_get_device_description(device, description, description_size);
11052
+ }
11053
+
11054
+ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
11055
+ ggml_cuda_set_device(device);
11056
+
11057
+ CUDA_CHECK(cudaMemGetInfo(free, total));
10843
11058
  }
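The three device-query helpers added here compose into a simple capability probe; the sketch below assumes they are declared in ggml-cuda.h, as the GGML_CALL annotations suggest.

    #include "ggml-cuda.h"
    #include <cstdio>

    int main() {
        const int n = ggml_backend_cuda_get_device_count();
        for (int i = 0; i < n; ++i) {
            char   desc[128];
            size_t free_mem = 0, total_mem = 0;
            ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
            ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
            printf("device %d: %s, %zu / %zu MiB free\n",
                   i, desc, free_mem/1024/1024, total_mem/1024/1024);
        }
        return 0;
    }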
10844
11059
 
10845
- static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
11060
+ // backend registry
11061
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
10846
11062
  ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
10847
11063
  return cuda_backend;
10848
11064
 
10849
11065
  UNUSED(params);
10850
11066
  }
10851
11067
 
10852
- extern "C" int ggml_backend_cuda_reg_devices();
11068
+ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
10853
11069
 
10854
- int ggml_backend_cuda_reg_devices() {
11070
+ GGML_CALL int ggml_backend_cuda_reg_devices() {
10855
11071
  int device_count = ggml_cuda_get_device_count();
10856
11072
  //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
10857
11073
  for (int i = 0; i < device_count; i++) {