llama_cpp 0.12.1 → 0.12.2

This diff represents the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -8,8 +8,13 @@
8
8
  #include <limits>
9
9
  #include <stdint.h>
10
10
  #include <stdio.h>
11
+ #include <string>
11
12
  #include <vector>
12
-
13
+ #include <map>
14
+ #include <array>
15
+ #include "ggml-cuda.h"
16
+ #include "ggml.h"
17
+ #include "ggml-backend-impl.h"
13
18
 
14
19
  #if defined(GGML_USE_HIPBLAS)
15
20
  #include <hip/hip_runtime.h>
@@ -77,6 +82,7 @@
77
82
  #define cudaMemcpyKind hipMemcpyKind
78
83
  #define cudaMemset hipMemset
79
84
  #define cudaMemsetAsync hipMemsetAsync
85
+ #define cudaMemGetInfo hipMemGetInfo
80
86
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
81
87
  #define cudaSetDevice hipSetDevice
82
88
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -112,9 +118,7 @@
112
118
 
113
119
  #endif // defined(GGML_USE_HIPBLAS)
114
120
 
115
- #include "ggml-cuda.h"
116
- #include "ggml.h"
117
- #include "ggml-backend-impl.h"
121
+ #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
118
122
 
119
123
  #define CC_PASCAL 600
120
124
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
@@ -519,6 +523,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
519
523
  #define CUDA_ACC_BLOCK_SIZE 256
520
524
  #define CUDA_IM2COL_BLOCK_SIZE 256
521
525
 
526
+ #define CUDA_Q8_0_NE_ALIGN 2048
527
+
522
528
  // dmmv = dequantize_mul_mat_vec
523
529
  #ifndef GGML_CUDA_DMMV_X
524
530
  #define GGML_CUDA_DMMV_X 32
@@ -562,7 +568,7 @@ static void ggml_cuda_set_device(const int device) {
562
568
 
563
569
  static int g_device_count = -1;
564
570
  static int g_main_device = 0;
565
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
571
+ static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};
566
572
 
567
573
  struct cuda_device_capabilities {
568
574
  int cc; // compute capability
@@ -573,10 +579,6 @@ struct cuda_device_capabilities {
573
579
 
574
580
  static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
575
581
 
576
- static void * g_scratch_buffer = nullptr;
577
- static size_t g_scratch_size = 0; // disabled by default
578
- static size_t g_scratch_offset = 0;
579
-
580
582
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
581
583
 
582
584
  [[noreturn]]
@@ -605,16 +607,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
605
607
  }
606
608
 
607
609
  static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
608
- #if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
609
- (void) a;
610
- bad_arch();
611
- #else
610
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
612
611
  #pragma unroll
613
612
  for (int mask = 16; mask > 0; mask >>= 1) {
614
613
  a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
615
614
  }
616
615
  return a;
617
- #endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
616
+ #else
617
+ (void) a;
618
+ bad_arch();
619
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
618
620
  }
619
621
 
620
622
  static __device__ __forceinline__ float warp_reduce_max(float x) {
@@ -626,16 +628,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
626
628
  }
627
629
 
628
630
  static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
629
- #if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
630
- (void) x;
631
- bad_arch();
632
- #else
631
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
633
632
  #pragma unroll
634
633
  for (int mask = 16; mask > 0; mask >>= 1) {
635
634
  x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
636
635
  }
637
636
  return x;
638
- #endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
637
+ #else
638
+ (void) x;
639
+ bad_arch();
640
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
639
641
  }
640
642
 
641
643
  static __device__ __forceinline__ float op_repeat(const float a, const float b) {
@@ -1103,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
1103
1105
  #endif // GGML_CUDA_F16
1104
1106
  }
1105
1107
 
1108
+ template<typename dst_t>
1109
+ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
1110
+
1111
+ const int i = blockIdx.x;
1112
+
1113
+ // assume 32 threads
1114
+ const int tid = threadIdx.x;
1115
+ const int il = tid/8;
1116
+ const int ir = tid%8;
1117
+ const int ib = 8*i + ir;
1118
+ if (ib >= nb32) {
1119
+ return;
1120
+ }
1121
+
1122
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
1123
+
1124
+ const block_q4_0 * x = (const block_q4_0 *)vx + ib;
1125
+ const float d = __half2float(x->d);
1126
+ const float dm = -8*d;
1127
+
1128
+ const uint8_t * q = x->qs + 4*il;
1129
+
1130
+ for (int l = 0; l < 4; ++l) {
1131
+ y[l+ 0] = d * (q[l] & 0xF) + dm;
1132
+ y[l+16] = d * (q[l] >> 4) + dm;
1133
+ }
1134
+ }
1135
+
1136
+ template<typename dst_t>
1137
+ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
1138
+
1139
+ const int i = blockIdx.x;
1140
+
1141
+ // assume 32 threads
1142
+ const int tid = threadIdx.x;
1143
+ const int il = tid/8;
1144
+ const int ir = tid%8;
1145
+ const int ib = 8*i + ir;
1146
+ if (ib >= nb32) {
1147
+ return;
1148
+ }
1149
+
1150
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
1151
+
1152
+ const block_q4_1 * x = (const block_q4_1 *)vx + ib;
1153
+ const float2 d = __half22float2(x->dm);
1154
+
1155
+ const uint8_t * q = x->qs + 4*il;
1156
+
1157
+ for (int l = 0; l < 4; ++l) {
1158
+ y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
1159
+ y[l+16] = d.x * (q[l] >> 4) + d.y;
1160
+ }
1161
+ }
1162
+
1106
1163
  //================================== k-quants
1107
1164
 
1108
1165
  template<typename dst_t>
@@ -2327,6 +2384,45 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
2327
2384
  y[i] = x[i];
2328
2385
  }
2329
2386
 
2387
+ template <bool need_check>
2388
+ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
2389
+ #if __CUDA_ARCH__ >= CC_PASCAL
2390
+ constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
2391
+
2392
+ const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
2393
+ const int * x0 = ((int *) vx) + blockIdx.x * nint;
2394
+ half2 * y2 = (half2 *) (y + i0);
2395
+
2396
+ __shared__ int vals[nint];
2397
+
2398
+ #pragma unroll
2399
+ for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
2400
+ if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
2401
+ break;
2402
+ }
2403
+
2404
+ const int ix = ix0 + threadIdx.x;
2405
+ vals[ix] = x0[ix];
2406
+ }
2407
+
2408
+ #pragma unroll
2409
+ for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
2410
+ if (need_check && i0 + iy + 2*threadIdx.x >= k) {
2411
+ return;
2412
+ }
2413
+
2414
+ const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
2415
+ const half d = *b0;
2416
+ const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
2417
+
2418
+ y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
2419
+ }
2420
+ #else
2421
+ (void) vx; (void) y; (void) k;
2422
+ bad_arch();
2423
+ #endif // __CUDA_ARCH__ >= CC_PASCAL
2424
+ }
2425
+
2330
2426
  // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
2331
2427
  // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
2332
2428
 
@@ -5613,7 +5709,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
5613
5709
 
5614
5710
  template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
5615
5711
  static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
5616
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
5712
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
5617
5713
  const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
5618
5714
  const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
5619
5715
 
@@ -5738,7 +5834,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
5738
5834
  #else
5739
5835
  (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
5740
5836
  bad_arch();
5741
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
5837
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
5742
5838
  }
5743
5839
 
5744
5840
  template <bool vals_smem, int ncols_template, int block_size_template>
@@ -6181,6 +6277,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri
6181
6277
  dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
6182
6278
  }
6183
6279
 
6280
+ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
6281
+ const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
6282
+ if (k % CUDA_Q8_0_NE_ALIGN == 0) {
6283
+ const bool need_check = false;
6284
+ dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
6285
+ } else {
6286
+ const bool need_check = true;
6287
+ dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
6288
+ }
6289
+ }
6290
+
6184
6291
  template<typename dst_t>
6185
6292
  static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6186
6293
  const int nb = k / QK_K;
@@ -6201,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
6201
6308
  #endif
6202
6309
  }
6203
6310
 
6311
+ template<typename dst_t>
6312
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6313
+ const int nb32 = k / 32;
6314
+ const int nb = (k + 255) / 256;
6315
+ dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
6316
+ }
6317
+
6318
+ template<typename dst_t>
6319
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6320
+ const int nb32 = k / 32;
6321
+ const int nb = (k + 255) / 256;
6322
+ dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
6323
+ }
6324
+
6204
6325
  template<typename dst_t>
6205
6326
  static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6206
6327
  const int nb = k / QK_K;
@@ -6246,16 +6367,21 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
6246
6367
  }
6247
6368
 
6248
6369
  static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
6370
+ int id;
6249
6371
  switch (type) {
6250
6372
  case GGML_TYPE_Q4_0:
6251
- return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
6373
+ return dequantize_row_q4_0_cuda;
6252
6374
  case GGML_TYPE_Q4_1:
6253
- return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
6375
+ return dequantize_row_q4_1_cuda;
6254
6376
  case GGML_TYPE_Q5_0:
6255
6377
  return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
6256
6378
  case GGML_TYPE_Q5_1:
6257
6379
  return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
6258
6380
  case GGML_TYPE_Q8_0:
6381
+ CUDA_CHECK(cudaGetDevice(&id));
6382
+ if (g_device_caps[id].cc >= CC_PASCAL) {
6383
+ return dequantize_block_q8_0_f16_cuda;
6384
+ }
6259
6385
  return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
6260
6386
  case GGML_TYPE_Q2_K:
6261
6387
  return dequantize_row_q2_K_cuda;
@@ -6281,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
6281
6407
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
6282
6408
  switch (type) {
6283
6409
  case GGML_TYPE_Q4_0:
6284
- return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
6410
+ return dequantize_row_q4_0_cuda;
6285
6411
  case GGML_TYPE_Q4_1:
6286
- return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
6412
+ return dequantize_row_q4_1_cuda;
6287
6413
  case GGML_TYPE_Q5_0:
6288
6414
  return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
6289
6415
  case GGML_TYPE_Q5_1:
@@ -7489,11 +7615,11 @@ struct cuda_pool_alloc {
7489
7615
 
7490
7616
  static bool g_cublas_loaded = false;
7491
7617
 
7492
- bool ggml_cublas_loaded(void) {
7618
+ GGML_CALL bool ggml_cublas_loaded(void) {
7493
7619
  return g_cublas_loaded;
7494
7620
  }
7495
7621
 
7496
- void ggml_init_cublas() {
7622
+ GGML_CALL void ggml_init_cublas() {
7497
7623
  static bool initialized = false;
7498
7624
 
7499
7625
  if (!initialized) {
@@ -7546,8 +7672,9 @@ void ggml_init_cublas() {
7546
7672
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
7547
7673
  fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
7548
7674
 
7549
- g_tensor_split[id] = total_vram;
7675
+ g_default_tensor_split[id] = total_vram;
7550
7676
  total_vram += prop.totalGlobalMem;
7677
+
7551
7678
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
7552
7679
  g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
7553
7680
  #else
@@ -7556,7 +7683,7 @@ void ggml_init_cublas() {
7556
7683
  g_device_caps[id].smpb = prop.sharedMemPerBlock;
7557
7684
  }
7558
7685
  for (int id = 0; id < g_device_count; ++id) {
7559
- g_tensor_split[id] /= total_vram;
7686
+ g_default_tensor_split[id] /= total_vram;
7560
7687
  }
7561
7688
 
7562
7689
  for (int id = 0; id < g_device_count; ++id) {
@@ -7580,31 +7707,7 @@ void ggml_init_cublas() {
7580
7707
  }
7581
7708
  }
7582
7709
 
7583
- void ggml_cuda_set_tensor_split(const float * tensor_split) {
7584
- if (tensor_split == nullptr) {
7585
- return;
7586
- }
7587
- bool all_zero = true;
7588
- for (int i = 0; i < g_device_count; ++i) {
7589
- if (tensor_split[i] != 0.0f) {
7590
- all_zero = false;
7591
- break;
7592
- }
7593
- }
7594
- if (all_zero) {
7595
- return;
7596
- }
7597
- float split_sum = 0.0f;
7598
- for (int i = 0; i < g_device_count; ++i) {
7599
- g_tensor_split[i] = split_sum;
7600
- split_sum += tensor_split[i];
7601
- }
7602
- for (int i = 0; i < g_device_count; ++i) {
7603
- g_tensor_split[i] /= split_sum;
7604
- }
7605
- }
7606
-
7607
- void * ggml_cuda_host_malloc(size_t size) {
7710
+ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
7608
7711
  if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
7609
7712
  return nullptr;
7610
7713
  }
@@ -7622,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
7622
7725
  return ptr;
7623
7726
  }
7624
7727
 
7625
- void ggml_cuda_host_free(void * ptr) {
7728
+ GGML_CALL void ggml_cuda_host_free(void * ptr) {
7626
7729
  CUDA_CHECK(cudaFreeHost(ptr));
7627
7730
  }
7628
7731
 
@@ -8055,11 +8158,11 @@ static void ggml_cuda_op_mul_mat_q(
8055
8158
  (void) src1_ddf_i;
8056
8159
  }
8057
8160
 
8058
- static int64_t get_row_rounding(ggml_type type) {
8161
+ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
8059
8162
  int64_t min_compute_capability = INT_MAX;
8060
8163
  int64_t max_compute_capability = INT_MIN;
8061
8164
  for (int id = 0; id < g_device_count; ++id) {
8062
- if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8165
+ if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
8063
8166
  if (min_compute_capability > g_device_caps[id].cc) {
8064
8167
  min_compute_capability = g_device_caps[id].cc;
8065
8168
  }
@@ -8120,6 +8223,21 @@ static int64_t get_row_rounding(ggml_type type) {
8120
8223
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
8121
8224
  }
8122
8225
 
8226
+ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
8227
+ const int64_t nrows = ggml_nrows(tensor);
8228
+ const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
8229
+
8230
+ *row_low = id == 0 ? 0 : nrows*tensor_split[id];
8231
+ *row_low -= *row_low % rounding;
8232
+
8233
+ if (id == g_device_count - 1) {
8234
+ *row_high = nrows;
8235
+ } else {
8236
+ *row_high = nrows*tensor_split[id + 1];
8237
+ *row_high -= *row_high % rounding;
8238
+ }
8239
+ }
8240
+
8123
8241
  static void ggml_cuda_op_mul_mat_vec_q(
8124
8242
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
8125
8243
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -8574,15 +8692,15 @@ static void ggml_cuda_op_soft_max(
8574
8692
  float scale = 1.0f;
8575
8693
  memcpy(&scale, dst->op_params, sizeof(float));
8576
8694
 
8577
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
8578
- const bool use_f16_soft_max = false;
8579
- #else
8695
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
8580
8696
  #ifdef GGML_CUDA_F16
8581
8697
  const bool use_f16_soft_max = true;
8582
8698
  #else
8583
8699
  const bool use_f16_soft_max = false;
8584
8700
  #endif // GGML_CUDA_F16
8585
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
8701
+ #else
8702
+ const bool use_f16_soft_max = false;
8703
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
8586
8704
 
8587
8705
  if (use_f16_soft_max) {
8588
8706
  soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
@@ -8737,6 +8855,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
8737
8855
  peer_access_enabled = enable_peer_access;
8738
8856
  }
8739
8857
 
8858
+ // FIXME: move this somewhere else
8859
+ struct ggml_backend_cuda_split_buffer_type_context {
8860
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
8861
+ };
8862
+
8740
8863
  static void ggml_cuda_op_mul_mat(
8741
8864
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
8742
8865
  const bool convert_src1_to_q8_1) {
@@ -8788,6 +8911,14 @@ static void ggml_cuda_op_mul_mat(
8788
8911
  GGML_ASSERT(!(split && ne03 > 1));
8789
8912
  GGML_ASSERT(!(split && ne02 < ne12));
8790
8913
 
8914
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
8915
+ if (split) {
8916
+ // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
8917
+ // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
8918
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
8919
+ tensor_split = buft_ctx->tensor_split;
8920
+ }
8921
+
8791
8922
  struct dev_data {
8792
8923
  cuda_pool_alloc<char> src0_dd_alloc;
8793
8924
  cuda_pool_alloc<float> src1_ddf_alloc;
@@ -8815,17 +8946,17 @@ static void ggml_cuda_op_mul_mat(
8815
8946
  // for multi GPU, get the row boundaries from tensor split
8816
8947
  // and round to mul_mat_q tile sizes
8817
8948
  if (split) {
8818
- const int64_t rounding = get_row_rounding(src0->type);
8949
+ const int64_t rounding = get_row_rounding(src0->type, tensor_split);
8819
8950
 
8820
8951
  if (id != 0) {
8821
- dev[id].row_low = ne01*g_tensor_split[id];
8952
+ dev[id].row_low = ne01*tensor_split[id];
8822
8953
  if (dev[id].row_low < ne01) {
8823
8954
  dev[id].row_low -= dev[id].row_low % rounding;
8824
8955
  }
8825
8956
  }
8826
8957
 
8827
8958
  if (id != g_device_count - 1) {
8828
- dev[id].row_high = ne01*g_tensor_split[id + 1];
8959
+ dev[id].row_high = ne01*tensor_split[id + 1];
8829
8960
  if (dev[id].row_high < ne01) {
8830
8961
  dev[id].row_high -= dev[id].row_high % rounding;
8831
8962
  }
@@ -9111,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
9111
9242
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
9112
9243
  }
9113
9244
 
9114
- bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
9245
+ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
9115
9246
  if (!g_cublas_loaded) return false;
9116
9247
 
9117
9248
  const int64_t ne10 = src1->ne[0];
@@ -9371,10 +9502,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
9371
9502
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
9372
9503
 
9373
9504
  int64_t min_compute_capability = INT_MAX;
9374
- for (int id = 0; id < g_device_count; ++id) {
9375
- if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
9376
- min_compute_capability = g_device_caps[id].cc;
9505
+
9506
+ if (split) {
9507
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
9508
+ auto & tensor_split = buft_ctx->tensor_split;
9509
+ for (int id = 0; id < g_device_count; ++id) {
9510
+ if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
9511
+ min_compute_capability = g_device_caps[id].cc;
9512
+ }
9377
9513
  }
9514
+ } else {
9515
+ min_compute_capability = g_device_caps[g_main_device].cc;
9378
9516
  }
9379
9517
 
9380
9518
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@@ -9413,7 +9551,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
9413
9551
  } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
9414
9552
  // KQV single-batch
9415
9553
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
9416
- } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
9554
+ } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
9417
9555
  // KQ + KQV multi-batch
9418
9556
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
9419
9557
  } else if (src0->type == GGML_TYPE_F32) {
@@ -9875,297 +10013,39 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
9875
10013
  return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
9876
10014
  }
9877
10015
 
9878
- void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
9879
- const int64_t nrows = ggml_nrows(tensor);
9880
-
9881
- const int64_t ne0 = tensor->ne[0];
9882
-
9883
- const size_t nb1 = tensor->nb[1];
9884
-
9885
- ggml_backend_type backend = tensor->backend;
9886
- ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
9887
- memset(extra, 0, sizeof(*extra));
9888
-
9889
- for (int id = 0; id < g_device_count; ++id) {
9890
- if (backend == GGML_BACKEND_GPU && id != g_main_device) {
9891
- continue;
9892
- }
9893
-
9894
- ggml_cuda_set_device(id);
9895
-
9896
- int64_t row_low, row_high;
9897
- if (backend == GGML_BACKEND_GPU) {
9898
- row_low = 0;
9899
- row_high = nrows;
9900
- } else if (backend == GGML_BACKEND_GPU_SPLIT) {
9901
- const int64_t rounding = get_row_rounding(tensor->type);
9902
-
9903
- row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
9904
- row_low -= row_low % rounding;
9905
-
9906
- if (id == g_device_count - 1) {
9907
- row_high = nrows;
9908
- } else {
9909
- row_high = nrows*g_tensor_split[id + 1];
9910
- row_high -= row_high % rounding;
9911
- }
9912
- } else {
9913
- GGML_ASSERT(false);
9914
- }
9915
- if (row_low == row_high) {
9916
- continue;
9917
- }
9918
-
9919
- int64_t nrows_split = row_high - row_low;
9920
-
9921
- const size_t offset_split = row_low*nb1;
9922
- size_t size = ggml_nbytes_split(tensor, nrows_split);
9923
- const size_t original_size = size;
9924
-
9925
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
9926
- if (ne0 % MATRIX_ROW_PADDING != 0) {
9927
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
9928
- }
9929
-
9930
- char * buf;
9931
- CUDA_CHECK(cudaMalloc(&buf, size));
9932
- char * buf_host = (char *)data + offset_split;
9933
-
9934
- // set padding to 0 to avoid possible NaN values
9935
- if (size > original_size) {
9936
- CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
9937
- }
9938
-
9939
- CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
9940
-
9941
- extra->data_device[id] = buf;
9942
-
9943
- if (backend == GGML_BACKEND_GPU_SPLIT) {
9944
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
9945
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
9946
- }
9947
- }
9948
- }
9949
-
9950
- tensor->extra = extra;
9951
- }
9952
-
9953
- void ggml_cuda_free_data(struct ggml_tensor * tensor) {
9954
- if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
10016
+ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
10017
+ if (main_device >= g_device_count) {
10018
+ fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
10019
+ main_device, g_device_count, g_main_device);
9955
10020
  return;
9956
10021
  }
9957
10022
 
9958
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9959
-
9960
- for (int id = 0; id < g_device_count; ++id) {
9961
- ggml_cuda_set_device(id);
9962
- if (extra->data_device[id] != nullptr) {
9963
- CUDA_CHECK(cudaFree(extra->data_device[id]));
9964
- }
9965
-
9966
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
9967
- if (extra->events[id][is] != nullptr) {
9968
- CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
9969
- }
9970
- }
9971
- }
9972
-
9973
- delete extra;
9974
- }
9975
-
9976
- static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
9977
- static size_t g_temp_tensor_extra_index = 0;
9978
-
9979
- static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
9980
- if (g_temp_tensor_extras == nullptr) {
9981
- g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
10023
+ if (g_main_device != main_device && g_device_count > 1) {
10024
+ g_main_device = main_device;
10025
+ //cudaDeviceProp prop;
10026
+ //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
10027
+ //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
9982
10028
  }
9983
-
9984
- size_t alloc_index = g_temp_tensor_extra_index;
9985
- g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
9986
- ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
9987
- memset(extra, 0, sizeof(*extra));
9988
-
9989
- return extra;
9990
10029
  }
9991
10030
 
9992
- static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
9993
- if (scratch && g_scratch_size == 0) {
9994
- return;
9995
- }
9996
-
9997
- tensor->backend = GGML_BACKEND_GPU;
9998
-
9999
- // recursively assign CUDA buffers until a compute tensor is found
10000
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
10001
- const ggml_op src0_op = tensor->src[0]->op;
10002
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
10003
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
10004
- }
10005
- }
10006
- if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
10007
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
10008
- }
10009
-
10010
- if (scratch && no_alloc) {
10011
- return;
10012
- }
10013
-
10014
- ggml_tensor_extra_gpu * extra;
10015
-
10016
- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
10017
- tensor->op == GGML_OP_VIEW ||
10018
- force_inplace;
10019
- const size_t size = ggml_nbytes(tensor);
10020
-
10021
- ggml_cuda_set_device(g_main_device);
10022
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
10023
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
10024
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
10025
- size_t offset = 0;
10026
- if (tensor->op == GGML_OP_VIEW) {
10027
- memcpy(&offset, tensor->op_params, sizeof(size_t));
10028
- }
10029
- extra = ggml_cuda_alloc_temp_tensor_extra();
10030
- extra->data_device[g_main_device] = src0_ddc + offset;
10031
- } else if (tensor->op == GGML_OP_CPY) {
10032
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
10033
- void * src1_ddv = src1_extra->data_device[g_main_device];
10034
- extra = ggml_cuda_alloc_temp_tensor_extra();
10035
- extra->data_device[g_main_device] = src1_ddv;
10036
- } else if (scratch) {
10037
- GGML_ASSERT(size <= g_scratch_size);
10038
- if (g_scratch_offset + size > g_scratch_size) {
10039
- g_scratch_offset = 0;
10040
- }
10041
-
10042
- char * data = (char *) g_scratch_buffer;
10043
- if (data == nullptr) {
10044
- CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
10045
- g_scratch_buffer = data;
10046
- }
10047
- extra = ggml_cuda_alloc_temp_tensor_extra();
10048
- extra->data_device[g_main_device] = data + g_scratch_offset;
10049
-
10050
- g_scratch_offset += size;
10051
-
10052
- GGML_ASSERT(g_scratch_offset <= g_scratch_size);
10053
- } else { // allocate new buffers outside of scratch
10054
- void * data;
10055
- CUDA_CHECK(cudaMalloc(&data, size));
10056
- CUDA_CHECK(cudaMemset(data, 0, size));
10057
- extra = new ggml_tensor_extra_gpu;
10058
- memset(extra, 0, sizeof(*extra));
10059
- extra->data_device[g_main_device] = data;
10060
- }
10031
+ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
10032
+ if (!g_cublas_loaded) return false;
10061
10033
 
10062
- tensor->extra = extra;
10063
- }
10034
+ ggml_cuda_func_t func;
10035
+ const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
10036
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
10037
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
10064
10038
 
10065
- void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
10066
- if (g_scratch_size == 0) {
10067
- return;
10068
- }
10069
- if (g_scratch_buffer == nullptr) {
10070
- ggml_cuda_set_device(g_main_device);
10071
- CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
10039
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
10040
+ return false;
10072
10041
  }
10073
10042
 
10074
- ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
10075
-
10076
- const bool inplace = tensor->view_src != nullptr;
10077
-
10078
- if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
10079
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
10080
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
10081
- size_t view_offset = 0;
10082
- if (tensor->op == GGML_OP_VIEW) {
10083
- memcpy(&view_offset, tensor->op_params, sizeof(size_t));
10084
- }
10085
- extra->data_device[g_main_device] = src0_ddc + view_offset;
10086
- } else {
10087
- extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
10088
- }
10089
-
10090
- tensor->extra = extra;
10091
- }
10092
-
10093
- void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
10094
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10095
- GGML_ASSERT(ggml_is_contiguous(tensor));
10096
-
10097
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
10098
- ggml_cuda_set_device(g_main_device);
10099
- CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
10100
- }
10101
-
10102
- void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
10103
- ggml_cuda_assign_buffers_impl(tensor, true, false, false);
10104
- }
10105
-
10106
- void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
10107
- ggml_cuda_assign_buffers_impl(tensor, true, false, true);
10108
- }
10109
-
10110
- void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
10111
- ggml_cuda_assign_buffers_impl(tensor, false, false, false);
10112
- }
10113
-
10114
- void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
10115
- ggml_cuda_assign_buffers_impl(tensor, false, true, false);
10116
- }
10117
-
10118
- void ggml_cuda_set_main_device(const int main_device) {
10119
- if (main_device >= g_device_count) {
10120
- fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
10121
- main_device, g_device_count, g_main_device);
10122
- return;
10123
- }
10124
-
10125
- if (g_main_device != main_device && g_device_count > 1) {
10126
- g_main_device = main_device;
10127
- cudaDeviceProp prop;
10128
- CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
10129
- fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
10130
- }
10131
- }
10132
-
10133
- void ggml_cuda_set_scratch_size(const size_t scratch_size) {
10134
- // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
10135
- // it still won't always work as expected, but it's better than nothing
10136
- if (scratch_size > g_scratch_size) {
10137
- ggml_cuda_free_scratch();
10138
- }
10139
- g_scratch_size = std::max(g_scratch_size, scratch_size);
10140
- }
10141
-
10142
- void ggml_cuda_free_scratch() {
10143
- if (g_scratch_buffer == nullptr) {
10144
- return;
10145
- }
10146
-
10147
- CUDA_CHECK(cudaFree(g_scratch_buffer));
10148
- g_scratch_buffer = nullptr;
10149
- }
10150
-
10151
- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
10152
- if (!g_cublas_loaded) return false;
10153
-
10154
- ggml_cuda_func_t func;
10155
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
10156
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
10157
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
10158
-
10159
- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
10160
- return false;
10161
- }
10162
-
10163
- if (tensor->op == GGML_OP_MUL_MAT) {
10164
- if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
10165
- #ifndef NDEBUG
10166
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
10167
- #endif
10168
- return false;
10043
+ if (tensor->op == GGML_OP_MUL_MAT) {
10044
+ if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
10045
+ #ifndef NDEBUG
10046
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
10047
+ #endif
10048
+ return false;
10169
10049
  }
10170
10050
  }
10171
10051
 
@@ -10306,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
10306
10186
  return true;
10307
10187
  }
10308
10188
 
10309
- int ggml_cuda_get_device_count() {
10189
+ GGML_CALL int ggml_cuda_get_device_count() {
10310
10190
  int device_count;
10311
10191
  if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
10312
10192
  return 0;
@@ -10314,7 +10194,7 @@ int ggml_cuda_get_device_count() {
10314
10194
  return device_count;
10315
10195
  }
10316
10196
 
10317
- void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
10197
+ GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
10318
10198
  cudaDeviceProp prop;
10319
10199
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
10320
10200
  snprintf(description, description_size, "%s", prop.name);
@@ -10326,21 +10206,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
10326
10206
 
10327
10207
  #define UNUSED GGML_UNUSED
10328
10208
 
10209
+ struct ggml_backend_cuda_context {
10210
+ int device;
10211
+ std::string name;
10212
+ };
10213
+
10329
10214
  // cuda buffer
10330
10215
 
10331
- struct ggml_backend_buffer_context_cuda {
10216
+ struct ggml_backend_cuda_buffer_context {
10332
10217
  int device;
10333
10218
  void * dev_ptr = nullptr;
10334
10219
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
10335
10220
  size_t temp_tensor_extra_index = 0;
10221
+ std::string name;
10336
10222
 
10337
- ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
10223
+ ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
10224
+ device(device), dev_ptr(dev_ptr),
10225
+ name(GGML_CUDA_NAME + std::to_string(device)) {
10226
+ }
10338
10227
 
10339
- ~ggml_backend_buffer_context_cuda() {
10228
+ ~ggml_backend_cuda_buffer_context() {
10340
10229
  delete[] temp_tensor_extras;
10341
10230
  }
10342
10231
 
10343
10232
  ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
10233
+ // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
10344
10234
  if (temp_tensor_extras == nullptr) {
10345
10235
  temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
10346
10236
  }
@@ -10354,19 +10244,28 @@ struct ggml_backend_buffer_context_cuda {
10354
10244
  }
10355
10245
  };
10356
10246
 
10357
- static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10358
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10247
+ GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
10248
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10249
+ return ctx->name.c_str();
10250
+ }
10251
+
10252
+ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
10253
+ return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
10254
+ }
10255
+
10256
+ GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10257
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10359
10258
  CUDA_CHECK(cudaFree(ctx->dev_ptr));
10360
10259
  delete ctx;
10361
10260
  }
10362
10261
 
10363
- static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
10364
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10262
+ GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
10263
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10365
10264
  return ctx->dev_ptr;
10366
10265
  }
10367
10266
 
10368
- static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10369
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10267
+ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10268
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10370
10269
 
10371
10270
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
10372
10271
  assert(tensor->view_src->buffer->buft == buffer->buft);
@@ -10395,14 +10294,12 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
10395
10294
  CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
10396
10295
  }
10397
10296
  }
10398
-
10399
- UNUSED(buffer);
10400
10297
  }
10401
10298
 
10402
- static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10299
+ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10403
10300
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10404
10301
 
10405
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10302
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10406
10303
 
10407
10304
  ggml_cuda_set_device(ctx->device);
10408
10305
  CUDA_CHECK(cudaDeviceSynchronize());
@@ -10410,61 +10307,93 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
10410
10307
  CUDA_CHECK(cudaDeviceSynchronize());
10411
10308
  }
10412
10309
 
10413
- static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10310
+ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10414
10311
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10415
10312
 
10416
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10313
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10417
10314
 
10418
10315
  ggml_cuda_set_device(ctx->device);
10419
10316
  CUDA_CHECK(cudaDeviceSynchronize());
10420
-
10421
10317
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
10318
+ CUDA_CHECK(cudaDeviceSynchronize());
10422
10319
  }
10423
10320
 
10424
- static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10425
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10321
+ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
10322
+ if (ggml_backend_buffer_is_cuda(src->buffer)) {
10323
+ ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
10324
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10325
+
10326
+ ggml_cuda_set_device(src_ctx->device);
10327
+ CUDA_CHECK(cudaDeviceSynchronize());
10328
+ ggml_cuda_set_device(dst_ctx->device);
10329
+ CUDA_CHECK(cudaDeviceSynchronize());
10330
+ CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
10331
+ CUDA_CHECK(cudaDeviceSynchronize());
10332
+
10333
+ return true;
10334
+ }
10335
+ return false;
10336
+ }
10337
+
10338
+ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10339
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10426
10340
 
10427
10341
  ggml_cuda_set_device(ctx->device);
10428
10342
  CUDA_CHECK(cudaDeviceSynchronize());
10429
-
10430
10343
  CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
10344
+ CUDA_CHECK(cudaDeviceSynchronize());
10431
10345
  }
10432
10346
 
10433
- static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
10347
+ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
10348
+ /* .get_name = */ ggml_backend_cuda_buffer_get_name,
10434
10349
  /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
10435
10350
  /* .get_base = */ ggml_backend_cuda_buffer_get_base,
10436
10351
  /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
10437
10352
  /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
10438
10353
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
10439
- /* .cpy_tensor_from = */ NULL,
10440
- /* .cpy_tensor_to = */ NULL,
10354
+ /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
10441
10355
  /* .clear = */ ggml_backend_cuda_buffer_clear,
10356
+ /* .reset = */ NULL,
10442
10357
  };
10443
10358
 
10444
10359
  // cuda buffer type
10360
+ struct ggml_backend_cuda_buffer_type_context {
10361
+ int device;
10362
+ std::string name;
10363
+ };
10445
10364
 
10446
- static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10447
- int device = (int) (intptr_t) buft->context;
10365
+ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
10366
+ ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10448
10367
 
10449
- ggml_cuda_set_device(device);
10368
+ return ctx->name.c_str();
10369
+ }
10370
+
10371
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10372
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10373
+
10374
+ ggml_cuda_set_device(buft_ctx->device);
10450
10375
 
10451
10376
  size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
10452
10377
 
10453
10378
  void * dev_ptr;
10454
- CUDA_CHECK(cudaMalloc(&dev_ptr, size));
10379
+ cudaError_t err = cudaMalloc(&dev_ptr, size);
10380
+ if (err != cudaSuccess) {
10381
+ fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
10382
+ return nullptr;
10383
+ }
10455
10384
 
10456
- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
10385
+ ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
10457
10386
 
10458
- return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
10387
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
10459
10388
  }
10460
10389
 
10461
- static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
10390
+ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
10462
10391
  return 128;
10463
10392
 
10464
10393
  UNUSED(buft);
10465
10394
  }
10466
10395
 
10467
- static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
10396
+ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10468
10397
  int64_t row_low = 0;
10469
10398
  int64_t row_high = ggml_nrows(tensor);
10470
10399
  int64_t nrows_split = row_high - row_low;
@@ -10484,22 +10413,33 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
10484
10413
  UNUSED(buft);
10485
10414
  }
10486
10415
 
10487
- static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10488
- return ggml_backend_is_cuda(backend);
10416
+ GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10417
+ if (!ggml_backend_is_cuda(backend)) {
10418
+ return false;
10419
+ }
10489
10420
 
10490
- UNUSED(buft);
10421
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10422
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10423
+
10424
+ return buft_ctx->device == cuda_ctx->device;
10491
10425
  }
10492
10426
 
10493
10427
  static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
10428
+ /* .get_name = */ ggml_backend_cuda_buffer_type_name,
10494
10429
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
10495
10430
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
10496
10431
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
10497
10432
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
10498
- /* .is_host = */ nullptr,
10433
+ /* .is_host = */ NULL,
10499
10434
  };
10500
10435
 
10501
- ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10502
- static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
10436
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10437
+ // FIXME: this is not thread safe
10438
+ if (device >= ggml_backend_cuda_get_device_count()) {
10439
+ return nullptr;
10440
+ }
10441
+
10442
+ static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
10503
10443
 
10504
10444
  static bool ggml_backend_cuda_buffer_type_initialized = false;
10505
10445
 
@@ -10507,7 +10447,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10507
10447
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
10508
10448
  ggml_backend_cuda_buffer_types[i] = {
10509
10449
  /* .iface = */ ggml_backend_cuda_buffer_type_interface,
10510
- /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
10450
+ /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
10511
10451
  };
10512
10452
  }
10513
10453
  ggml_backend_cuda_buffer_type_initialized = true;
@@ -10516,13 +10456,311 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10516
10456
  return &ggml_backend_cuda_buffer_types[device];
10517
10457
  }
10518
10458
 
10459
+ // cuda split buffer
10460
+
10461
+ struct ggml_backend_cuda_split_buffer_context {
10462
+ ~ggml_backend_cuda_split_buffer_context() {
10463
+ for (ggml_tensor_extra_gpu * extra : tensor_extras) {
10464
+ for (int id = 0; id < g_device_count; ++id) {
10465
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
10466
+ if (extra->events[id][is] != nullptr) {
10467
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
10468
+ }
10469
+ }
10470
+ if (extra->data_device[id] != nullptr) {
10471
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
10472
+ }
10473
+ }
10474
+ delete extra;
10475
+ }
10476
+ }
10477
+
10478
+ std::vector<ggml_tensor_extra_gpu *> tensor_extras;
10479
+ };
10480
+
10481
+ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
10482
+ return GGML_CUDA_NAME "_Split";
10483
+
10484
+ UNUSED(buffer);
10485
+ }
10486
+
10487
+ // unused at the moment
10488
+ //static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
10489
+ // return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
10490
+ //}
10491
+
10492
+ GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10493
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
10494
+ delete ctx;
10495
+ }
10496
+
10497
+ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
10498
+ // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
10499
+ return (void *)0x1000;
10500
+
10501
+ UNUSED(buffer);
10502
+ }
10503
+
10504
+ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10505
+ GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
10506
+
10507
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
10508
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
10509
+
10510
+ const int64_t ne0 = tensor->ne[0];
10511
+
10512
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
10513
+
10514
+ ctx->tensor_extras.push_back(extra);
10515
+
10516
+ for (int id = 0; id < g_device_count; ++id) {
10517
+ int64_t row_low, row_high;
10518
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
10519
+
10520
+ int64_t nrows_split = row_high - row_low;
10521
+ if (nrows_split == 0) {
10522
+ continue;
10523
+ }
10524
+
10525
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
10526
+ const size_t original_size = size;
10527
+
10528
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10529
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10530
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10531
+ }
10532
+
10533
+ // FIXME: do not crash if cudaMalloc fails
10534
+ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
10535
+ ggml_cuda_set_device(id);
10536
+ char * buf;
10537
+ CUDA_CHECK(cudaMalloc(&buf, size));
10538
+
10539
+ // set padding to 0 to avoid possible NaN values
10540
+ if (size > original_size) {
10541
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
10542
+ }
10543
+
10544
+ extra->data_device[id] = buf;
10545
+
10546
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
10547
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
10548
+ }
10549
+ }
10550
+ tensor->backend = GGML_BACKEND_GPU_SPLIT;
10551
+ tensor->extra = extra;
10552
+ }
10553
+
10554
+ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10555
+ // split tensors must always be set in their entirety at once
10556
+ GGML_ASSERT(offset == 0);
10557
+ GGML_ASSERT(size == ggml_nbytes(tensor));
10558
+
10559
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
10560
+
10561
+ const int64_t ne0 = tensor->ne[0];
10562
+ const size_t nb1 = tensor->nb[1];
10563
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
10564
+
10565
+ for (int id = 0; id < g_device_count; ++id) {
10566
+ int64_t row_low, row_high;
10567
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
10568
+
10569
+ int64_t nrows_split = row_high - row_low;
10570
+ if (nrows_split == 0) {
10571
+ continue;
10572
+ }
10573
+
10574
+ const size_t offset_split = row_low*nb1;
10575
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
10576
+ const size_t original_size = size;
10577
+
10578
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10579
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10580
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10581
+ }
10582
+
10583
+ const char * buf_host = (const char *)data + offset_split;
10584
+ CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
10585
+ }
10586
+ }
10587
+
10588
+ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10589
+ // split tensors must always be read in their entirety at once
10590
+ GGML_ASSERT(offset == 0);
10591
+ GGML_ASSERT(size == ggml_nbytes(tensor));
10592
+
10593
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
10594
+
10595
+ const int64_t ne0 = tensor->ne[0];
10596
+ const size_t nb1 = tensor->nb[1];
10597
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
10598
+
10599
+ for (int id = 0; id < g_device_count; ++id) {
10600
+ int64_t row_low, row_high;
10601
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
10602
+
10603
+ int64_t nrows_split = row_high - row_low;
10604
+ if (nrows_split == 0) {
10605
+ continue;
10606
+ }
10607
+
10608
+ const size_t offset_split = row_low*nb1;
10609
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
10610
+ const size_t original_size = size;
10611
+
10612
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10613
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10614
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10615
+ }
10616
+
10617
+ char * buf_host = (char *)data + offset_split;
10618
+ CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
10619
+ }
10620
+ }
10621
+
10622
+ GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10623
+ UNUSED(buffer);
10624
+ UNUSED(value);
10625
+ }
10626
+
10627
+ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
10628
+ /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
10629
+ /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
10630
+ /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
10631
+ /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
10632
+ /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
10633
+ /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
10634
+ /* .cpy_tensor = */ NULL,
10635
+ /* .clear = */ ggml_backend_cuda_split_buffer_clear,
10636
+ /* .reset = */ NULL,
10637
+ };
10638
+
10639
+ // cuda split buffer type
10640
+
10641
+ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
10642
+ return GGML_CUDA_NAME "_Split";
10643
+
10644
+ UNUSED(buft);
10645
+ }
10646
+
10647
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10648
+ // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
10649
+ // instead, we allocate them for each tensor separately in init_tensor
10650
+ // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
10651
+ // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
10652
+ ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
10653
+
10654
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
10655
+ }
10656
+
10657
+ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
10658
+ return 128;
10659
+
10660
+ UNUSED(buft);
10661
+ }
10662
+
10663
+ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10664
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
10665
+
10666
+ size_t total_size = 0;
10667
+
10668
+ const int64_t ne0 = tensor->ne[0];
10669
+
10670
+ for (int id = 0; id < g_device_count; ++id) {
10671
+ int64_t row_low, row_high;
10672
+ get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
10673
+
10674
+ int64_t nrows_split = row_high - row_low;
10675
+ if (nrows_split == 0) {
10676
+ continue;
10677
+ }
10678
+
10679
+ total_size += ggml_nbytes_split(tensor, nrows_split);
10680
+
10681
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10682
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10683
+ total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10684
+ }
10685
+ }
10686
+
10687
+ return total_size;
10688
+ }
10689
+
10690
+ GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10691
+ return ggml_backend_is_cuda(backend);
10692
+
10693
+ UNUSED(buft);
10694
+ }
10695
+
10696
+ GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
10697
+ return false;
10698
+
10699
+ UNUSED(buft);
10700
+ }
10701
+
10702
+ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
10703
+ /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
10704
+ /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
10705
+ /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
10706
+ /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
10707
+ /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
10708
+ /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
10709
+ };
10710
+
10711
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
10712
+ // FIXME: this is not thread safe
10713
+ static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
10714
+
10715
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
10716
+
10717
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
10718
+ if (all_zero) {
10719
+ tensor_split_arr = g_default_tensor_split;
10720
+ } else {
10721
+ float split_sum = 0.0f;
10722
+ for (int i = 0; i < g_device_count; ++i) {
10723
+ tensor_split_arr[i] = split_sum;
10724
+ split_sum += tensor_split[i];
10725
+ }
10726
+ for (int i = 0; i < g_device_count; ++i) {
10727
+ tensor_split_arr[i] /= split_sum;
10728
+ }
10729
+ }
10730
+
10731
+ auto it = buft_map.find(tensor_split_arr);
10732
+ if (it != buft_map.end()) {
10733
+ return &it->second;
10734
+ }
10735
+
10736
+ struct ggml_backend_buffer_type buft {
10737
+ /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
10738
+ /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
10739
+ };
10740
+
10741
+ auto result = buft_map.emplace(tensor_split_arr, buft);
10742
+ return &result.first->second;
10743
+ }
10744
+
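A short illustration of the normalization above, as a sketch rather than code from the package, assuming exactly two visible devices: the caller passes per-device proportions, and the cached buffer type context stores cumulative, normalized row boundaries.

    // two devices with a 3:1 row split
    const float split[GGML_CUDA_MAX_DEVICES] = {3.0f, 1.0f};
    ggml_backend_buffer_type_t buft = ggml_backend_cuda_split_buffer_type(split);
    // internally tensor_split_arr becomes {0.0f, 0.75f}: device 0 owns rows in [0.0, 0.75),
    // device 1 owns rows in [0.75, 1.0); calling again with the same split returns the cached buft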
10519
10745
  // host buffer type
10520
10746
 
10521
- static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10747
+ GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
10748
+ return GGML_CUDA_NAME "_Host";
10749
+
10750
+ UNUSED(buft);
10751
+ }
10752
+
10753
+ GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
10754
+ return GGML_CUDA_NAME "_Host";
10755
+
10756
+ UNUSED(buffer);
10757
+ }
10758
+
10759
+ GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10522
10760
  ggml_cuda_host_free(buffer->context);
10523
10761
  }
10524
10762
 
10525
- static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10763
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10526
10764
  void * ptr = ggml_cuda_host_malloc(size);
10527
10765
 
10528
10766
  if (ptr == nullptr) {
@@ -10530,17 +10768,18 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
10530
10768
  return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
10531
10769
  }
10532
10770
 
10533
- // FIXME: this is a hack to avoid having to implement a new buffer type
10534
10771
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
10535
10772
  buffer->buft = buft;
10773
+ buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
10536
10774
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
10537
10775
 
10538
10776
  return buffer;
10539
10777
  }
10540
10778
 
10541
- ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10779
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10542
10780
  static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
10543
10781
  /* .iface = */ {
10782
+ /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
10544
10783
  /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
10545
10784
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
10546
10785
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
@@ -10555,31 +10794,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10555
10794
 
10556
10795
  // backend
10557
10796
 
10558
- struct ggml_backend_context_cuda {
10559
- int device;
10560
- };
10797
+ GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
10798
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10561
10799
 
10562
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
10563
- return GGML_CUDA_NAME;
10564
-
10565
- UNUSED(backend);
10800
+ return cuda_ctx->name.c_str();
10566
10801
  }
10567
10802
 
10568
- static void ggml_backend_cuda_free(ggml_backend_t backend) {
10569
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10803
+ GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
10804
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10570
10805
 
10571
10806
  delete cuda_ctx;
10572
10807
  delete backend;
10573
10808
  }
10574
10809
 
10575
- static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
10576
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10810
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
10811
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10577
10812
 
10578
10813
  return ggml_backend_cuda_buffer_type(cuda_ctx->device);
10579
10814
  }
10580
10815
 
10581
- static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10582
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10816
+ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10817
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10583
10818
 
10584
10819
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
10585
10820
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10587,8 +10822,8 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
10587
10822
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
10588
10823
  }
10589
10824
 
10590
- static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10591
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10825
+ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10826
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10592
10827
 
10593
10828
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
10594
10829
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10596,39 +10831,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
10596
10831
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
10597
10832
  }
10598
10833
 
10599
- static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
10600
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10601
-
10602
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
10603
-
10604
- UNUSED(backend);
10605
- }
10606
-
10607
- static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
10608
- GGML_ASSERT(!"not implemented");
10834
+ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
10835
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10609
10836
 
10610
- return nullptr;
10837
+ if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
10838
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
10839
+ return true;
10840
+ }
10611
10841
 
10612
- UNUSED(backend);
10613
- UNUSED(cgraph);
10842
+ return false;
10614
10843
  }
10615
10844
 
10616
- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
10617
- GGML_ASSERT(!"not implemented");
10845
+ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
10846
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10618
10847
 
10619
- UNUSED(backend);
10620
- UNUSED(plan);
10621
- }
10622
-
10623
- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
10624
- GGML_ASSERT(!"not implemented");
10848
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
10625
10849
 
10626
10850
  UNUSED(backend);
10627
- UNUSED(plan);
10628
10851
  }
10629
10852
 
10630
- static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
10631
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10853
+ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
10854
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10632
10855
 
10633
10856
  ggml_cuda_set_main_device(cuda_ctx->device);
10634
10857
 
@@ -10638,57 +10861,35 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
10638
10861
  for (int i = 0; i < cgraph->n_nodes; i++) {
10639
10862
  ggml_tensor * node = cgraph->nodes[i];
10640
10863
 
10641
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
10864
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
10642
10865
  continue;
10866
+ }
10643
10867
 
10644
- assert(node->backend == GGML_BACKEND_GPU);
10868
+ #ifndef NDEBUG
10869
+ assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
10645
10870
  assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
10646
10871
  assert(node->extra != nullptr);
10647
10872
 
10648
10873
  for (int j = 0; j < GGML_MAX_SRC; j++) {
10649
10874
  if (node->src[j] != nullptr) {
10650
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
10875
+ assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
10651
10876
  assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
10652
10877
  assert(node->src[j]->extra != nullptr);
10653
10878
  }
10654
10879
  }
10880
+ #endif
10655
10881
 
10656
10882
  bool ok = ggml_cuda_compute_forward(&params, node);
10657
10883
  if (!ok) {
10658
10884
  fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
10659
10885
  }
10660
10886
  GGML_ASSERT(ok);
10661
-
10662
- #if 0
10663
- if (node->type == GGML_TYPE_F32) {
10664
- cudaDeviceSynchronize();
10665
- std::vector<float> tmp(ggml_nelements(node), 0.0f);
10666
- cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
10667
- printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
10668
- ggml_type_name(node->src[0]->type),
10669
- node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
10670
- node->src[0]->name,
10671
- node->src[1] ? node->src[1]->name : "none");
10672
- double sum = 0.0;
10673
- double sq_sum = 0.0;
10674
- for (int i = 0; i < ggml_nelements(node); i++) {
10675
- printf("%f ", tmp[i]);
10676
- sum += tmp[i];
10677
- sq_sum += tmp[i]*tmp[i];
10678
- }
10679
- printf("\n");
10680
- printf("sum: %f, ", sum);
10681
- printf("sq_sum: %f\n", sq_sum);
10682
- }
10683
- #endif
10684
10887
  }
10685
10888
 
10686
- UNUSED(backend);
10687
-
10688
10889
  return true;
10689
10890
  }
10690
10891
 
10691
- static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
10892
+ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
10692
10893
  switch (op->op) {
10693
10894
  case GGML_OP_UNARY:
10694
10895
  switch (ggml_get_unary_op(op)) {
@@ -10799,23 +11000,22 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
10799
11000
  UNUSED(backend);
10800
11001
  }
10801
11002
 
10802
- static ggml_backend_i cuda_backend_i = {
11003
+ static ggml_backend_i ggml_backend_cuda_interface = {
10803
11004
  /* .get_name = */ ggml_backend_cuda_name,
10804
11005
  /* .free = */ ggml_backend_cuda_free,
10805
11006
  /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
10806
11007
  /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
10807
11008
  /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
10808
- /* .cpy_tensor_from_async = */ NULL,
10809
- /* .cpy_tensor_to_async = */ NULL,
11009
+ /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
10810
11010
  /* .synchronize = */ ggml_backend_cuda_synchronize,
10811
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
10812
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
10813
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
11011
+ /* .graph_plan_create = */ NULL,
11012
+ /* .graph_plan_free = */ NULL,
11013
+ /* .graph_plan_compute = */ NULL,
10814
11014
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
10815
11015
  /* .supports_op = */ ggml_backend_cuda_supports_op,
10816
11016
  };
10817
11017
 
10818
- ggml_backend_t ggml_backend_cuda_init(int device) {
11018
+ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
10819
11019
  ggml_init_cublas(); // TODO: remove from ggml.c
10820
11020
 
10821
11021
  if (device < 0 || device >= ggml_cuda_get_device_count()) {
@@ -10826,32 +11026,48 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
10826
11026
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
10827
11027
  ggml_cuda_set_main_device(device);
10828
11028
 
10829
- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
10830
- /* .device = */ device
11029
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
11030
+ /* .device = */ device,
11031
+ /* .name = */ GGML_CUDA_NAME + std::to_string(device),
10831
11032
  };
10832
11033
 
10833
11034
  ggml_backend_t cuda_backend = new ggml_backend {
10834
- /* .interface = */ cuda_backend_i,
11035
+ /* .interface = */ ggml_backend_cuda_interface,
10835
11036
  /* .context = */ ctx
10836
11037
  };
10837
11038
 
10838
11039
  return cuda_backend;
10839
11040
  }
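A minimal usage sketch of the initializer above (assumptions: device 0 exists; ggml_backend_name and ggml_backend_free come from the generic ggml-backend API):

    ggml_backend_t backend = ggml_backend_cuda_init(0);
    if (backend != nullptr) {
        // per-device name: GGML_CUDA_NAME plus the device index, e.g. "CUDA0"
        printf("%s\n", ggml_backend_name(backend));
        GGML_ASSERT(ggml_backend_is_cuda(backend));
        ggml_backend_free(backend);
    }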
10840
11041
 
10841
- bool ggml_backend_is_cuda(ggml_backend_t backend) {
10842
- return backend->iface.get_name == ggml_backend_cuda_name;
11042
+ GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
11043
+ return backend && backend->iface.get_name == ggml_backend_cuda_name;
11044
+ }
11045
+
11046
+ GGML_CALL int ggml_backend_cuda_get_device_count() {
11047
+ return ggml_cuda_get_device_count();
11048
+ }
11049
+
11050
+ GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
11051
+ ggml_cuda_get_device_description(device, description, description_size);
11052
+ }
11053
+
11054
+ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
11055
+ ggml_cuda_set_device(device);
11056
+
11057
+ CUDA_CHECK(cudaMemGetInfo(free, total));
10843
11058
  }
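The accessors added above (device count, description, and free/total memory) let callers enumerate devices without touching the CUDA runtime directly. A hedged sketch of typical use:

    int n_devices = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n_devices; ++i) {
        char desc[128];
        size_t free_mem, total_mem;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("device %d: %s, %zu of %zu MiB free\n",
               i, desc, free_mem / (1024*1024), total_mem / (1024*1024));
    }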
10844
11059
 
10845
- static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
11060
+ // backend registry
11061
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
10846
11062
  ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
10847
11063
  return cuda_backend;
10848
11064
 
10849
11065
  UNUSED(params);
10850
11066
  }
10851
11067
 
10852
- extern "C" int ggml_backend_cuda_reg_devices();
11068
+ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
10853
11069
 
10854
- int ggml_backend_cuda_reg_devices() {
11070
+ GGML_CALL int ggml_backend_cuda_reg_devices() {
10855
11071
  int device_count = ggml_cuda_get_device_count();
10856
11072
  //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
10857
11073
  for (int i = 0; i < device_count; i++) {