llama_cpp 0.10.2 → 0.10.3

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -68,8 +68,9 @@
68
68
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
69
69
  #endif
70
70
  #define cudaMemcpy hipMemcpy
71
- #define cudaMemcpy2DAsync hipMemcpy2DAsync
72
71
  #define cudaMemcpyAsync hipMemcpyAsync
72
+ #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
73
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
73
74
  #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
74
75
  #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
75
76
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
@@ -86,10 +87,29 @@
86
87
  #define cudaStream_t hipStream_t
87
88
  #define cudaSuccess hipSuccess
88
89
  #define __trap abort
90
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
91
+ #define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
92
+ #define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
93
+ #define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
94
+ #define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
95
+ #define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
96
+ #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
97
+ #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
98
+ #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
89
99
  #else
90
100
  #include <cuda_runtime.h>
101
+ #include <cuda.h>
91
102
  #include <cublas_v2.h>
92
103
  #include <cuda_fp16.h>
104
+
105
+ #if CUDART_VERSION < 11020
106
+ #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
107
+ #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
108
+ #define CUBLAS_COMPUTE_16F CUDA_R_16F
109
+ #define CUBLAS_COMPUTE_32F CUDA_R_32F
110
+ #define cublasComputeType_t cudaDataType_t
111
+ #endif // CUDART_VERSION < 11020
112
+
93
113
  #endif // defined(GGML_USE_HIPBLAS)
94
114
 
95
115
  #include "ggml-cuda.h"
@@ -144,7 +164,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
144
164
  const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
145
165
  #if __has_builtin(__builtin_elementwise_sub_sat)
146
166
  const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
147
- return reinterpret_cast<const int&>(c);
167
+ return reinterpret_cast<const int &>(c);
148
168
  #else
149
169
  int8x4_t c;
150
170
  int16_t tmp;
@@ -155,7 +175,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
155
175
  if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
156
176
  c[i] = tmp;
157
177
  }
158
- return reinterpret_cast<int&>(c);
178
+ return reinterpret_cast<int &>(c);
159
179
  #endif // __has_builtin(__builtin_elementwise_sub_sat)
160
180
  }
161
181
 
@@ -193,45 +213,59 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
193
213
 
194
214
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
195
215
 
196
- #define CUDA_CHECK(err) \
197
- do { \
198
- cudaError_t err_ = (err); \
199
- if (err_ != cudaSuccess) { \
200
- int id; \
201
- cudaGetDevice(&id); \
202
- fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
203
- cudaGetErrorString(err_)); \
204
- fprintf(stderr, "current device: %d\n", id); \
205
- GGML_ASSERT(!"CUDA error"); \
206
- } \
216
+ [[noreturn]]
217
+ static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
218
+ int id = -1; // in case cudaGetDevice fails
219
+ cudaGetDevice(&id);
220
+
221
+ fprintf(stderr, "CUDA error: %s\n", msg);
222
+ fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
223
+ fprintf(stderr, " %s\n", stmt);
224
+ // abort with GGML_ASSERT to get a stack trace
225
+ GGML_ASSERT(!"CUDA error");
226
+ }
227
+
228
+ #define CUDA_CHECK_GEN(err, success, error_fn) \
229
+ do { \
230
+ auto err_ = (err); \
231
+ if (err_ != (success)) { \
232
+ ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
233
+ } \
207
234
  } while (0)
208
235
 
236
+ #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
237
+
209
238
  #if CUDART_VERSION >= 12000
210
- #define CUBLAS_CHECK(err) \
211
- do { \
212
- cublasStatus_t err_ = (err); \
213
- if (err_ != CUBLAS_STATUS_SUCCESS) { \
214
- int id; \
215
- cudaGetDevice(&id); \
216
- fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
217
- err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
218
- fprintf(stderr, "current device: %d\n", id); \
219
- GGML_ASSERT(!"cuBLAS error"); \
220
- } \
221
- } while (0)
239
+ static const char * cublas_get_error_str(const cublasStatus_t err) {
240
+ return cublasGetStatusString(err);
241
+ }
222
242
  #else
223
- #define CUBLAS_CHECK(err) \
224
- do { \
225
- cublasStatus_t err_ = (err); \
226
- if (err_ != CUBLAS_STATUS_SUCCESS) { \
227
- int id; \
228
- cudaGetDevice(&id); \
229
- fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
230
- fprintf(stderr, "current device: %d\n", id); \
231
- GGML_ASSERT(!"cuBLAS error"); \
232
- } \
233
- } while (0)
234
- #endif // CUDART_VERSION >= 11
243
+ static const char * cublas_get_error_str(const cublasStatus_t err) {
244
+ switch (err) {
245
+ case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
246
+ case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
247
+ case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
248
+ case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
249
+ case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
250
+ case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
251
+ case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
252
+ case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
253
+ case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
254
+ default: return "unknown error";
255
+ }
256
+ }
257
+ #endif // CUDART_VERSION >= 12000
258
+
259
+ #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
260
+
261
+ #if !defined(GGML_USE_HIPBLAS)
262
+ static const char * cu_get_error_str(CUresult err) {
263
+ const char * err_str;
264
+ cuGetErrorString(err, &err_str);
265
+ return err_str;
266
+ }
267
+ #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
268
+ #endif
235
269
 
236
270
  #if CUDART_VERSION >= 11100
237
271
  #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
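
This hunk collapses the old per-API CUDA_CHECK and CUBLAS_CHECK macro bodies into one generic CUDA_CHECK_GEN(err, success, error_fn) plus a [[noreturn]] ggml_cuda_error() reporter, and adds CU_CHECK for driver-API calls. A standalone sketch of the same pattern, using a fake API instead of CUDA so it compiles anywhere (all names below are illustrative):

    #include <cstdio>
    #include <cstdlib>

    enum fake_status { FAKE_OK = 0, FAKE_OOM = 1 };

    static const char * fake_status_str(fake_status s) {
        switch (s) {
            case FAKE_OK:  return "FAKE_OK";
            case FAKE_OOM: return "FAKE_OOM";
        }
        return "unknown";
    }

    [[noreturn]] static void report_error(const char * stmt, const char * file, int line, const char * msg) {
        std::fprintf(stderr, "error: %s\n  at %s:%d\n  %s\n", msg, file, line, stmt);
        std::abort();
    }

    // One generic macro; each API only supplies its success value and string function.
    #define CHECK_GEN(err, success, to_str)                              \
        do {                                                             \
            auto err_ = (err);                                           \
            if (err_ != (success)) {                                     \
                report_error(#err, __FILE__, __LINE__, to_str(err_));    \
            }                                                            \
        } while (0)

    #define FAKE_CHECK(err) CHECK_GEN(err, FAKE_OK, fake_status_str)

    static fake_status fake_alloc(bool ok) { return ok ? FAKE_OK : FAKE_OOM; }

    int main() {
        FAKE_CHECK(fake_alloc(true));    // passes silently
        // FAKE_CHECK(fake_alloc(false)); // would print the failing statement and abort
        return 0;
    }

The macro stringizes the failing statement (#err), so the report names the exact call that failed, which is also what the new ggml_cuda_error output does.
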
@@ -287,10 +321,10 @@ typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * s
287
321
  typedef void (*ggml_cuda_op_mul_mat_t)(
288
322
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
289
323
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
290
- const int64_t src1_padded_row_size, const cudaStream_t & stream);
324
+ const int64_t src1_padded_row_size, cudaStream_t stream);
291
325
  typedef void (*ggml_cuda_op_flatten_t)(
292
326
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
293
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);
327
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
294
328
 
295
329
  // QK = number of values after dequantization
296
330
  // QR = QK / number of values before dequantization
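
This hunk, and many later ones, changes stream parameters from const cudaStream_t & to plain cudaStream_t. cudaStream_t is an opaque handle (a pointer to CUstream_st), so passing it by value copies a single pointer and avoids an extra level of indirection; nothing about the stream itself is copied. A trivial illustration with a stand-in handle type (no CUDA required):

    #include <cstdio>

    struct stream_st;                 // opaque
    typedef stream_st * stream_t;     // handle = pointer, like cudaStream_t

    static void launch_by_value(stream_t s)       { std::printf("%p\n", (void *) s); }
    static void launch_by_ref(const stream_t & s) { std::printf("%p\n", (void *) s); } // old style

    int main() {
        stream_t s = nullptr;   // a real stream would come from cudaStreamCreate
        launch_by_value(s);
        launch_by_ref(s);
        return 0;
    }
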
@@ -496,22 +530,29 @@ struct ggml_tensor_extra_gpu {
496
530
 
497
531
  // this is faster on Windows
498
532
  // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
499
- inline cudaError_t ggml_cuda_set_device(const int device) {
533
+ static void ggml_cuda_set_device(const int device) {
500
534
  int current_device;
501
535
  CUDA_CHECK(cudaGetDevice(&current_device));
502
536
 
503
537
  if (device == current_device) {
504
- return cudaSuccess;
538
+ return;
505
539
  }
506
540
 
507
- return cudaSetDevice(device);
541
+ CUDA_CHECK(cudaSetDevice(device));
508
542
  }
509
543
 
510
544
  static int g_device_count = -1;
511
545
  static int g_main_device = 0;
512
- static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
513
546
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
514
547
 
548
+ struct cuda_device_capabilities {
549
+ int cc; // compute capability
550
+ bool vmm; // virtual memory support
551
+ size_t vmm_granularity; // granularity of virtual memory
552
+ };
553
+
554
+ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
555
+
515
556
  static void * g_scratch_buffer = nullptr;
516
557
  static size_t g_scratch_size = 0; // disabled by default
517
558
  static size_t g_scratch_offset = 0;
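
Here the parallel g_compute_capabilities array is replaced by a per-device cuda_device_capabilities struct that also records whether the driver supports virtual memory management and with what granularity; later hunks rewrite every g_compute_capabilities[id] read as g_device_caps[id].cc. A standalone sketch of that kind of per-device capability table (hypothetical values, not real queries):

    #include <cstdio>

    #define MAX_DEVICES 16

    struct device_caps {
        int    cc;               // compute capability, e.g. 86 for 8.6
        bool   vmm;              // virtual memory management supported
        size_t vmm_granularity;  // allocation granularity when vmm is true
    };

    static device_caps g_caps[MAX_DEVICES] = {};
    static int g_count = 2;

    int main() {
        g_caps[0] = {86, true, 2u << 20};
        g_caps[1] = {61, false, 0};

        // The same kind of query later hunks do against g_device_caps[id].cc:
        int min_cc = 1 << 30;
        for (int id = 0; id < g_count; ++id) {
            if (g_caps[id].cc < min_cc) min_cc = g_caps[id].cc;
        }
        std::printf("minimum compute capability in use: %d\n", min_cc);
        return 0;
    }
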
@@ -553,6 +594,7 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
553
594
 
554
595
  static __device__ __forceinline__ float op_repeat(const float a, const float b) {
555
596
  return b;
597
+ GGML_UNUSED(a);
556
598
  }
557
599
 
558
600
  static __device__ __forceinline__ float op_add(const float a, const float b) {
@@ -674,7 +716,7 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
674
716
  dst[i] = x[i] / (1.0f + expf(-x[i]));
675
717
  }
676
718
 
677
- static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
719
+ static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
678
720
  const float GELU_QUICK_COEF = -1.702f;
679
721
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
680
722
  if (i >= k) {
@@ -683,7 +725,7 @@ static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
683
725
  dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
684
726
  }
685
727
 
686
- static __global__ void tanh_f32(const float *x, float *dst, int k) {
728
+ static __global__ void tanh_f32(const float * x, float * dst, int k) {
687
729
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
688
730
  if (i >= k) {
689
731
  return;
@@ -700,7 +742,7 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
700
742
  dst[i] = fmaxf(x[i], 0);
701
743
  }
702
744
 
703
- static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
745
+ static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
704
746
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
705
747
  if (i >= k) {
706
748
  return;
@@ -753,7 +795,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
753
795
  }
754
796
  }
755
797
 
756
- static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
798
+ static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
757
799
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
758
800
  if (nidx >= ne0) {
759
801
  return;
@@ -778,7 +820,7 @@ static __global__ void concat_f32(const float *x,const float *y, float *dst, c
778
820
  }
779
821
  }
780
822
 
781
- static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
823
+ static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
782
824
  int ne0 = ne00 * scale_factor;
783
825
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
784
826
  if (nidx >= ne0) {
@@ -798,7 +840,7 @@ static __global__ void upscale_f32(const float *x, float *dst, const int ne00,
798
840
  dst[offset_dst] = x[offset_src];
799
841
  }
800
842
 
801
- static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
843
+ static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
802
844
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
803
845
  if (nidx >= ne0) {
804
846
  return;
@@ -4700,7 +4742,6 @@ static __global__ void mul_mat_p021_f16_f32(
4700
4742
 
4701
4743
  const int row_y = col_x;
4702
4744
 
4703
-
4704
4745
  // y is not transposed but permuted
4705
4746
  const int iy = channel*nrows_y + row_y;
4706
4747
 
@@ -5266,17 +5307,17 @@ static __global__ void im2col_f32_f16(
5266
5307
  const int ky = (i - kd) / OW;
5267
5308
  const int ix = i % OW;
5268
5309
 
5269
- const int iiw = ix * s0 + kx * d0 - p0;
5270
- const int iih = blockIdx.y * s1 + ky * d1 - p1;
5310
+ const int64_t iiw = ix * s0 + kx * d0 - p0;
5311
+ const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
5271
5312
 
5272
- const int offset_dst =
5313
+ const int64_t offset_dst =
5273
5314
  (blockIdx.y * OW + ix) * CHW +
5274
5315
  (blockIdx.z * (KW * KH) + ky * KW + kx);
5275
5316
 
5276
5317
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
5277
5318
  dst[offset_dst] = __float2half(0.0f);
5278
5319
  } else {
5279
- const int offset_src = blockIdx.z * offset_delta;
5320
+ const int64_t offset_src = blockIdx.z * offset_delta;
5280
5321
  dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
5281
5322
  }
5282
5323
  }
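
The im2col hunk widens iiw, iih, offset_dst and offset_src from int to int64_t: products such as (blockIdx.y * OW + ix) * CHW can exceed the 32-bit range for large tensors, and signed int overflow is undefined behaviour. A host-side illustration of the fix (sizes are made up):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical sizes whose product no longer fits in a 32-bit int.
        int row = 70000, width = 40000;

        // Widening one operand before the multiply keeps the whole expression in
        // 64 bits; doing `row * width` in plain int would overflow.
        int64_t offset = (int64_t) row * width;

        std::printf("offset = %lld\n", (long long) offset);
        return 0;
    }
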
@@ -5375,7 +5416,7 @@ struct bin_bcast_cuda {
5375
5416
  cne[3] = 1;
5376
5417
  };
5377
5418
 
5378
- auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
5419
+ auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
5379
5420
  cnb[1] *= cne[1];
5380
5421
  cnb[2] *= cne[2];
5381
5422
  cnb[3] *= cne[3];
@@ -5868,7 +5909,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
5868
5909
 
5869
5910
  int id;
5870
5911
  CUDA_CHECK(cudaGetDevice(&id));
5871
- const int compute_capability = g_compute_capabilities[id];
5912
+ const int compute_capability = g_device_caps[id].cc;
5872
5913
 
5873
5914
  int mmq_x, mmq_y, nwarps;
5874
5915
  if (compute_capability >= CC_RDNA2) {
@@ -5913,7 +5954,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
5913
5954
 
5914
5955
  int id;
5915
5956
  CUDA_CHECK(cudaGetDevice(&id));
5916
- const int compute_capability = g_compute_capabilities[id];
5957
+ const int compute_capability = g_device_caps[id].cc;
5917
5958
 
5918
5959
  int mmq_x, mmq_y, nwarps;
5919
5960
  if (compute_capability >= CC_RDNA2) {
@@ -5958,7 +5999,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
5958
5999
 
5959
6000
  int id;
5960
6001
  CUDA_CHECK(cudaGetDevice(&id));
5961
- const int compute_capability = g_compute_capabilities[id];
6002
+ const int compute_capability = g_device_caps[id].cc;
5962
6003
 
5963
6004
  int mmq_x, mmq_y, nwarps;
5964
6005
  if (compute_capability >= CC_RDNA2) {
@@ -6003,7 +6044,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
6003
6044
 
6004
6045
  int id;
6005
6046
  CUDA_CHECK(cudaGetDevice(&id));
6006
- const int compute_capability = g_compute_capabilities[id];
6047
+ const int compute_capability = g_device_caps[id].cc;
6007
6048
 
6008
6049
  int mmq_x, mmq_y, nwarps;
6009
6050
  if (compute_capability >= CC_RDNA2) {
@@ -6048,7 +6089,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
6048
6089
 
6049
6090
  int id;
6050
6091
  CUDA_CHECK(cudaGetDevice(&id));
6051
- const int compute_capability = g_compute_capabilities[id];
6092
+ const int compute_capability = g_device_caps[id].cc;
6052
6093
 
6053
6094
  int mmq_x, mmq_y, nwarps;
6054
6095
  if (compute_capability >= CC_RDNA2) {
@@ -6093,7 +6134,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
6093
6134
 
6094
6135
  int id;
6095
6136
  CUDA_CHECK(cudaGetDevice(&id));
6096
- const int compute_capability = g_compute_capabilities[id];
6137
+ const int compute_capability = g_device_caps[id].cc;
6097
6138
 
6098
6139
  int mmq_x, mmq_y, nwarps;
6099
6140
  if (compute_capability >= CC_RDNA2) {
@@ -6140,7 +6181,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
6140
6181
 
6141
6182
  int id;
6142
6183
  CUDA_CHECK(cudaGetDevice(&id));
6143
- const int compute_capability = g_compute_capabilities[id];
6184
+ const int compute_capability = g_device_caps[id].cc;
6144
6185
 
6145
6186
  int mmq_x, mmq_y, nwarps;
6146
6187
  if (compute_capability >= CC_RDNA2) {
@@ -6186,7 +6227,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
6186
6227
 
6187
6228
  int id;
6188
6229
  CUDA_CHECK(cudaGetDevice(&id));
6189
- const int compute_capability = g_compute_capabilities[id];
6230
+ const int compute_capability = g_device_caps[id].cc;
6190
6231
 
6191
6232
  int mmq_x, mmq_y, nwarps;
6192
6233
  if (compute_capability >= CC_RDNA2) {
@@ -6231,7 +6272,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
6231
6272
 
6232
6273
  int id;
6233
6274
  CUDA_CHECK(cudaGetDevice(&id));
6234
- const int compute_capability = g_compute_capabilities[id];
6275
+ const int compute_capability = g_device_caps[id].cc;
6235
6276
 
6236
6277
  int mmq_x, mmq_y, nwarps;
6237
6278
  if (compute_capability >= CC_RDNA2) {
@@ -6276,7 +6317,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
6276
6317
 
6277
6318
  int id;
6278
6319
  CUDA_CHECK(cudaGetDevice(&id));
6279
- const int compute_capability = g_compute_capabilities[id];
6320
+ const int compute_capability = g_device_caps[id].cc;
6280
6321
 
6281
6322
  int mmq_x, mmq_y, nwarps;
6282
6323
  if (compute_capability >= CC_RDNA2) {
@@ -6536,30 +6577,30 @@ struct scoped_spin_lock {
6536
6577
  scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
6537
6578
  };
6538
6579
 
6539
- struct cuda_buffer {
6580
+ static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
6581
+
6582
+ // #define DEBUG_CUDA_MALLOC
6583
+ struct ggml_cuda_buffer {
6540
6584
  void * ptr = nullptr;
6541
6585
  size_t size = 0;
6542
6586
  };
6543
6587
 
6544
- static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
6545
- static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
6588
+ static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
6589
+ static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
6546
6590
 
6547
- static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
6591
+ static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
6548
6592
  scoped_spin_lock lock(g_cuda_pool_lock);
6549
- int id;
6550
- CUDA_CHECK(cudaGetDevice(&id));
6551
6593
  #ifdef DEBUG_CUDA_MALLOC
6552
6594
  int nnz = 0;
6553
- size_t max_size = 0, tot_size = 0;
6595
+ size_t max_size = 0;
6554
6596
  #endif
6555
6597
  size_t best_diff = 1ull << 36;
6556
6598
  int ibest = -1;
6557
6599
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
6558
- cuda_buffer& b = g_cuda_buffer_pool[id][i];
6600
+ ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
6559
6601
  if (b.ptr != nullptr) {
6560
6602
  #ifdef DEBUG_CUDA_MALLOC
6561
6603
  ++nnz;
6562
- tot_size += b.size;
6563
6604
  if (b.size > max_size) max_size = b.size;
6564
6605
  #endif
6565
6606
  if (b.size >= size) {
@@ -6579,32 +6620,32 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
6579
6620
  }
6580
6621
  }
6581
6622
  if (ibest >= 0) {
6582
- cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
6623
+ ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest];
6583
6624
  void * ptr = b.ptr;
6584
6625
  *actual_size = b.size;
6585
6626
  b.ptr = nullptr;
6586
6627
  b.size = 0;
6587
6628
  return ptr;
6588
6629
  }
6589
- #ifdef DEBUG_CUDA_MALLOC
6590
- fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
6591
- (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
6592
- #endif
6593
6630
  void * ptr;
6594
6631
  size_t look_ahead_size = (size_t) (1.05 * size);
6595
6632
  look_ahead_size = 256 * ((look_ahead_size + 255)/256);
6633
+ ggml_cuda_set_device(device);
6596
6634
  CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
6597
6635
  *actual_size = look_ahead_size;
6636
+ g_cuda_pool_size[device] += look_ahead_size;
6637
+ #ifdef DEBUG_CUDA_MALLOC
6638
+ fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
6639
+ (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
6640
+ #endif
6598
6641
  return ptr;
6599
6642
  }
6600
6643
 
6601
- static void ggml_cuda_pool_free(void * ptr, size_t size) {
6644
+ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
6602
6645
  scoped_spin_lock lock(g_cuda_pool_lock);
6603
- int id;
6604
- CUDA_CHECK(cudaGetDevice(&id));
6605
6646
 
6606
6647
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
6607
- cuda_buffer& b = g_cuda_buffer_pool[id][i];
6648
+ ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
6608
6649
  if (b.ptr == nullptr) {
6609
6650
  b.ptr = ptr;
6610
6651
  b.size = size;
@@ -6612,9 +6653,149 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
6612
6653
  }
6613
6654
  }
6614
6655
  fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
6656
+ ggml_cuda_set_device(device);
6615
6657
  CUDA_CHECK(cudaFree(ptr));
6658
+ g_cuda_pool_size[device] -= size;
6616
6659
  }
6617
6660
 
6661
+ #if !defined(GGML_USE_HIPBLAS)
6662
+ // pool with virtual memory
6663
+ static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
6664
+ static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
6665
+ static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
6666
+
6667
+ static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
6668
+ scoped_spin_lock lock(g_cuda_pool_lock);
6669
+
6670
+ // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
6671
+ const size_t alignment = 128;
6672
+ size = alignment * ((size + alignment - 1) / alignment);
6673
+
6674
+ size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device];
6675
+
6676
+ if (size > avail) {
6677
+ // round up to the next multiple of the granularity
6678
+ size_t reserve_size = size - avail;
6679
+ const size_t granularity = g_device_caps[device].vmm_granularity;
6680
+ reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
6681
+
6682
+ GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
6683
+
6684
+ // allocate more physical memory
6685
+ CUmemAllocationProp prop = {};
6686
+ prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
6687
+ prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
6688
+ prop.location.id = device;
6689
+ CUmemGenericAllocationHandle handle;
6690
+ CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
6691
+
6692
+ // reserve virtual address space (if not already reserved)
6693
+ if (g_cuda_pool_addr[device] == 0) {
6694
+ CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
6695
+ }
6696
+
6697
+ // map at the end of the pool
6698
+ CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0));
6699
+
6700
+ // the memory allocation handle is no longer needed after mapping
6701
+ CU_CHECK(cuMemRelease(handle));
6702
+
6703
+ // set access
6704
+ CUmemAccessDesc access = {};
6705
+ access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
6706
+ access.location.id = device;
6707
+ access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
6708
+ CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1));
6709
+
6710
+ // add to the pool
6711
+ g_cuda_pool_size[device] += reserve_size;
6712
+
6713
+ //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
6714
+ // id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
6715
+ // (unsigned long long) (reserve_size/1024/1024));
6716
+ }
6717
+
6718
+ GGML_ASSERT(g_cuda_pool_addr[device] != 0);
6719
+
6720
+ void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]);
6721
+ *actual_size = size;
6722
+ g_cuda_pool_used[device] += size;
6723
+
6724
+ #ifdef DEBUG_CUDA_MALLOC
6725
+ printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
6726
+ #endif
6727
+
6728
+ return ptr;
6729
+ }
6730
+
6731
+ static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
6732
+ scoped_spin_lock lock(g_cuda_pool_lock);
6733
+
6734
+ #ifdef DEBUG_CUDA_MALLOC
6735
+ printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
6736
+ #endif
6737
+
6738
+ g_cuda_pool_used[device] -= size;
6739
+
6740
+ // all deallocations must be in reverse order of the allocations
6741
+ GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]));
6742
+ }
6743
+
6744
+ static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
6745
+ if (g_device_caps[device].vmm) {
6746
+ return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
6747
+ } else {
6748
+ return ggml_cuda_pool_malloc_leg(device, size, actual_size);
6749
+ }
6750
+ }
6751
+
6752
+ static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
6753
+ if (g_device_caps[device].vmm) {
6754
+ ggml_cuda_pool_free_vmm(device, ptr, size);
6755
+ } else {
6756
+ ggml_cuda_pool_free_leg(device, ptr, size);
6757
+ }
6758
+ }
6759
+ #else
6760
+ #define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
6761
+ #define ggml_cuda_pool_free ggml_cuda_pool_free_leg
6762
+ #endif // !defined(GGML_USE_HIPBLAS)
6763
+
6764
+ template<typename T>
6765
+ struct cuda_pool_alloc {
6766
+ int device = -1;
6767
+ T * ptr = nullptr;
6768
+ size_t actual_size = 0;
6769
+
6770
+ // size is in number of elements
6771
+ T * alloc(size_t size) {
6772
+ GGML_ASSERT(ptr == nullptr);
6773
+ CUDA_CHECK(cudaGetDevice(&device));
6774
+ ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size);
6775
+ return ptr;
6776
+ }
6777
+
6778
+ cuda_pool_alloc(size_t size) {
6779
+ alloc(size);
6780
+ }
6781
+
6782
+ ~cuda_pool_alloc() {
6783
+ if (ptr != nullptr) {
6784
+ ggml_cuda_pool_free(device, ptr, actual_size);
6785
+ }
6786
+ }
6787
+
6788
+ T * get() {
6789
+ return ptr;
6790
+ }
6791
+
6792
+ cuda_pool_alloc() = default;
6793
+ cuda_pool_alloc(const cuda_pool_alloc &) = delete;
6794
+ cuda_pool_alloc(cuda_pool_alloc &&) = delete;
6795
+ cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
6796
+ cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
6797
+ };
6798
+
6618
6799
  static bool g_cublas_loaded = false;
6619
6800
 
6620
6801
  bool ggml_cublas_loaded(void) {
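
The large hunk above splits the buffer pool into a legacy cudaMalloc-backed path and a virtual-memory path, and introduces the RAII helper cuda_pool_alloc<T>, which returns its buffer to the pool in its destructor; later hunks use it to delete the explicit ggml_cuda_pool_free calls and size bookkeeping. The core idea can be sketched independently of CUDA like this (malloc/free stand in for the pool; names are illustrative):

    #include <cstdlib>
    #include <cstdio>

    // RAII wrapper: the allocation is released automatically when the object
    // goes out of scope, including on early returns — the property the diff
    // relies on to drop the explicit free calls.
    template <typename T>
    struct scoped_buffer {
        T * ptr = nullptr;
        size_t n = 0;

        T * alloc(size_t count) {               // count is in elements, as in cuda_pool_alloc
            ptr = (T *) std::malloc(count * sizeof(T));
            n   = count;
            return ptr;
        }

        explicit scoped_buffer(size_t count) { alloc(count); }
        scoped_buffer() = default;
        scoped_buffer(const scoped_buffer &) = delete;
        scoped_buffer & operator=(const scoped_buffer &) = delete;

        ~scoped_buffer() { std::free(ptr); }

        T * get() { return ptr; }
    };

    int main() {
        scoped_buffer<float> tmp(1024);         // freed automatically at end of main
        tmp.get()[0] = 1.0f;
        std::printf("%f\n", tmp.get()[0]);
        return 0;
    }

Because the release happens in the destructor, every early return and error path frees the buffer without extra code, which is what lets the later hunks drop their cleanup blocks.
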
@@ -6653,16 +6834,33 @@ void ggml_init_cublas() {
6653
6834
  #endif
6654
6835
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
6655
6836
  for (int id = 0; id < g_device_count; ++id) {
6837
+ int device_vmm = 0;
6838
+
6839
+ #if !defined(GGML_USE_HIPBLAS)
6840
+ CUdevice device;
6841
+ CU_CHECK(cuDeviceGet(&device, id));
6842
+ CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
6843
+
6844
+ if (device_vmm) {
6845
+ CUmemAllocationProp alloc_prop = {};
6846
+ alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
6847
+ alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
6848
+ alloc_prop.location.id = id;
6849
+ CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
6850
+ }
6851
+ #endif // !defined(GGML_USE_HIPBLAS)
6852
+ g_device_caps[id].vmm = !!device_vmm;
6853
+
6656
6854
  cudaDeviceProp prop;
6657
6855
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
6658
- fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
6856
+ fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
6659
6857
 
6660
6858
  g_tensor_split[id] = total_vram;
6661
6859
  total_vram += prop.totalGlobalMem;
6662
6860
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
6663
- g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
6861
+ g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
6664
6862
  #else
6665
- g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
6863
+ g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
6666
6864
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
6667
6865
  }
6668
6866
  for (int id = 0; id < g_device_count; ++id) {
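
The init hunk queries CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED per device and records the allocation granularity, so ggml_cuda_pool_malloc can pick the VMM pool when available. That pool reserves one large virtual address range, maps physical chunks into it as demand grows, and only allows frees in reverse allocation order. A host-only sketch of that bump-allocator behaviour (plain memory instead of cuMemMap, made-up sizes):

    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    struct bump_pool {
        unsigned char * base;      // stands in for the reserved device address range
        size_t capacity = 0;       // how much has been "mapped" so far
        size_t used     = 0;

        void * alloc(size_t size) {
            const size_t align = 128;
            size = align * ((size + align - 1) / align);   // round up, as the real pool does
            if (used + size > capacity) {
                capacity = used + size;                    // the real pool maps more physical memory here
            }
            void * p = base + used;
            used += size;
            return p;
        }

        void free_(void * p, size_t size) {
            used -= size;
            assert(p == base + used);                      // LIFO-only, like ggml_cuda_pool_free_vmm
        }
    };

    int main() {
        static unsigned char backing[1 << 20];
        bump_pool pool{backing};
        void * a = pool.alloc(1000);
        void * b = pool.alloc(3000);
        pool.free_(b, 3072);   // sizes are the rounded-up sizes the real pool reports as actual_size
        pool.free_(a, 1024);
        std::printf("used after frees: %zu\n", pool.used);
        return 0;
    }
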
@@ -6670,7 +6868,7 @@ void ggml_init_cublas() {
6670
6868
  }
6671
6869
 
6672
6870
  for (int id = 0; id < g_device_count; ++id) {
6673
- CUDA_CHECK(ggml_cuda_set_device(id));
6871
+ ggml_cuda_set_device(id);
6674
6872
 
6675
6873
  // create cuda streams
6676
6874
  for (int is = 0; is < MAX_STREAMS; ++is) {
@@ -6722,8 +6920,7 @@ void * ggml_cuda_host_malloc(size_t size) {
6722
6920
  void * ptr = nullptr;
6723
6921
  cudaError_t err = cudaMallocHost((void **) &ptr, size);
6724
6922
  if (err != cudaSuccess) {
6725
- // The allocation error can be bypassed. A null ptr will assigned out of this function.
6726
- // This can fixed the OOM error in WSL.
6923
+ // clear the error
6727
6924
  cudaGetLastError();
6728
6925
  fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
6729
6926
  size/1024.0/1024.0, cudaGetErrorString(err));
@@ -6786,7 +6983,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
6786
6983
 
6787
6984
  static void ggml_cuda_op_get_rows(
6788
6985
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6789
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
6986
+ const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {
6790
6987
 
6791
6988
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
6792
6989
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -6828,9 +7025,9 @@ static void ggml_cuda_op_get_rows(
6828
7025
  }
6829
7026
 
6830
7027
  template<class op>
6831
- inline void ggml_cuda_op_bin_bcast(
7028
+ static void ggml_cuda_op_bin_bcast(
6832
7029
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6833
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7030
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6834
7031
 
6835
7032
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6836
7033
 
@@ -6849,7 +7046,7 @@ inline void ggml_cuda_op_bin_bcast(
6849
7046
 
6850
7047
  static void ggml_cuda_op_repeat(
6851
7048
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6852
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
7049
+ const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {
6853
7050
 
6854
7051
  ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
6855
7052
 
@@ -6857,16 +7054,16 @@ static void ggml_cuda_op_repeat(
6857
7054
  (void) src1_d;
6858
7055
  }
6859
7056
 
6860
- inline void ggml_cuda_op_add(
7057
+ static void ggml_cuda_op_add(
6861
7058
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6862
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7059
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6863
7060
 
6864
7061
  ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6865
7062
  }
6866
7063
 
6867
- inline void ggml_cuda_op_acc(
7064
+ static void ggml_cuda_op_acc(
6868
7065
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6869
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7066
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6870
7067
 
6871
7068
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6872
7069
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -6883,23 +7080,23 @@ inline void ggml_cuda_op_acc(
6883
7080
  (void) dst;
6884
7081
  }
6885
7082
 
6886
- inline void ggml_cuda_op_mul(
7083
+ static void ggml_cuda_op_mul(
6887
7084
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6888
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7085
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6889
7086
 
6890
7087
  ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6891
7088
  }
6892
7089
 
6893
- inline void ggml_cuda_op_div(
7090
+ static void ggml_cuda_op_div(
6894
7091
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6895
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7092
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6896
7093
 
6897
7094
  ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
6898
7095
  }
6899
7096
 
6900
- inline void ggml_cuda_op_gelu(
7097
+ static void ggml_cuda_op_gelu(
6901
7098
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6902
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7099
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6903
7100
 
6904
7101
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6905
7102
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6911,9 +7108,9 @@ inline void ggml_cuda_op_gelu(
6911
7108
  (void) src1_dd;
6912
7109
  }
6913
7110
 
6914
- inline void ggml_cuda_op_silu(
7111
+ static void ggml_cuda_op_silu(
6915
7112
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6916
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7113
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6917
7114
 
6918
7115
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6919
7116
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6925,9 +7122,9 @@ inline void ggml_cuda_op_silu(
6925
7122
  (void) src1_dd;
6926
7123
  }
6927
7124
 
6928
- inline void ggml_cuda_op_gelu_quick(
7125
+ static void ggml_cuda_op_gelu_quick(
6929
7126
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6930
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7127
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6931
7128
 
6932
7129
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6933
7130
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6939,9 +7136,9 @@ inline void ggml_cuda_op_gelu_quick(
6939
7136
  (void) src1_dd;
6940
7137
  }
6941
7138
 
6942
- inline void ggml_cuda_op_tanh(
7139
+ static void ggml_cuda_op_tanh(
6943
7140
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6944
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7141
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6945
7142
 
6946
7143
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6947
7144
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6953,9 +7150,9 @@ inline void ggml_cuda_op_tanh(
6953
7150
  (void) src1_dd;
6954
7151
  }
6955
7152
 
6956
- inline void ggml_cuda_op_relu(
7153
+ static void ggml_cuda_op_relu(
6957
7154
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6958
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7155
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6959
7156
 
6960
7157
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6961
7158
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6967,9 +7164,9 @@ inline void ggml_cuda_op_relu(
6967
7164
  (void) src1_dd;
6968
7165
  }
6969
7166
 
6970
- inline void ggml_cuda_op_leaky_relu(
7167
+ static void ggml_cuda_op_leaky_relu(
6971
7168
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6972
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7169
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6973
7170
 
6974
7171
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6975
7172
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6984,9 +7181,9 @@ inline void ggml_cuda_op_leaky_relu(
6984
7181
  (void) src1_dd;
6985
7182
  }
6986
7183
 
6987
- inline void ggml_cuda_op_sqr(
7184
+ static void ggml_cuda_op_sqr(
6988
7185
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6989
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7186
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
6990
7187
 
6991
7188
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6992
7189
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6998,9 +7195,9 @@ inline void ggml_cuda_op_sqr(
6998
7195
  (void) src1_dd;
6999
7196
  }
7000
7197
 
7001
- inline void ggml_cuda_op_norm(
7198
+ static void ggml_cuda_op_norm(
7002
7199
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7003
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7200
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7004
7201
 
7005
7202
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7006
7203
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7018,10 +7215,9 @@ inline void ggml_cuda_op_norm(
7018
7215
  (void) src1_dd;
7019
7216
  }
7020
7217
 
7021
-
7022
- inline void ggml_cuda_op_group_norm(
7218
+ static void ggml_cuda_op_group_norm(
7023
7219
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7024
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7220
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7025
7221
 
7026
7222
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7027
7223
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7035,9 +7231,9 @@ inline void ggml_cuda_op_group_norm(
7035
7231
  (void) src1_dd;
7036
7232
  }
7037
7233
 
7038
- inline void ggml_cuda_op_concat(
7234
+ static void ggml_cuda_op_concat(
7039
7235
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7040
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7236
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7041
7237
 
7042
7238
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7043
7239
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7051,9 +7247,9 @@ inline void ggml_cuda_op_concat(
7051
7247
  (void) dst;
7052
7248
  }
7053
7249
 
7054
- inline void ggml_cuda_op_upscale(
7250
+ static void ggml_cuda_op_upscale(
7055
7251
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7056
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7252
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7057
7253
 
7058
7254
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7059
7255
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -7068,9 +7264,9 @@ inline void ggml_cuda_op_upscale(
7068
7264
  (void) src1_dd;
7069
7265
  }
7070
7266
 
7071
- inline void ggml_cuda_op_pad(
7267
+ static void ggml_cuda_op_pad(
7072
7268
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7073
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7269
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7074
7270
 
7075
7271
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7076
7272
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -7085,9 +7281,9 @@ inline void ggml_cuda_op_pad(
7085
7281
  (void) src1_dd;
7086
7282
  }
7087
7283
 
7088
- inline void ggml_cuda_op_rms_norm(
7284
+ static void ggml_cuda_op_rms_norm(
7089
7285
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7090
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7286
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7091
7287
 
7092
7288
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7093
7289
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7105,10 +7301,10 @@ inline void ggml_cuda_op_rms_norm(
7105
7301
  (void) src1_dd;
7106
7302
  }
7107
7303
 
7108
- inline void ggml_cuda_op_mul_mat_q(
7304
+ static void ggml_cuda_op_mul_mat_q(
7109
7305
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7110
7306
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7111
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
7307
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
7112
7308
 
7113
7309
  const int64_t ne00 = src0->ne[0];
7114
7310
 
@@ -7170,13 +7366,13 @@ inline void ggml_cuda_op_mul_mat_q(
7170
7366
  static int64_t get_row_rounding(ggml_type type) {
7171
7367
  int64_t min_compute_capability = INT_MAX;
7172
7368
  int64_t max_compute_capability = INT_MIN;
7173
- for (int64_t id = 0; id < g_device_count; ++id) {
7369
+ for (int id = 0; id < g_device_count; ++id) {
7174
7370
  if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
7175
- if (min_compute_capability > g_compute_capabilities[id]) {
7176
- min_compute_capability = g_compute_capabilities[id];
7371
+ if (min_compute_capability > g_device_caps[id].cc) {
7372
+ min_compute_capability = g_device_caps[id].cc;
7177
7373
  }
7178
- if (max_compute_capability < g_compute_capabilities[id]) {
7179
- max_compute_capability = g_compute_capabilities[id];
7374
+ if (max_compute_capability < g_device_caps[id].cc) {
7375
+ max_compute_capability = g_device_caps[id].cc;
7180
7376
  }
7181
7377
  }
7182
7378
  }
@@ -7228,10 +7424,10 @@ static int64_t get_row_rounding(ggml_type type) {
7228
7424
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
7229
7425
  }
7230
7426
 
7231
- inline void ggml_cuda_op_mul_mat_vec_q(
7427
+ static void ggml_cuda_op_mul_mat_vec_q(
7232
7428
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7233
7429
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7234
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
7430
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
7235
7431
 
7236
7432
  GGML_ASSERT(ggml_nrows(src1) == 1);
7237
7433
 
@@ -7281,18 +7477,18 @@ inline void ggml_cuda_op_mul_mat_vec_q(
7281
7477
  (void) src1_padded_row_size;
7282
7478
  }
7283
7479
 
7284
- inline void ggml_cuda_op_dequantize_mul_mat_vec(
7480
+ static void ggml_cuda_op_dequantize_mul_mat_vec(
7285
7481
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7286
7482
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7287
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
7483
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
7288
7484
 
7289
7485
  const int64_t ne00 = src0->ne[0];
7290
7486
  const int64_t row_diff = row_high - row_low;
7291
7487
 
7292
7488
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
7293
7489
  #ifdef GGML_CUDA_F16
7294
- size_t ash;
7295
- dfloat * src1_dfloat = nullptr; // dfloat == half
7490
+ cuda_pool_alloc<half> src1_dfloat_a;
7491
+ half * src1_dfloat = nullptr; // dfloat == half
7296
7492
 
7297
7493
  bool src1_convert_f16 =
7298
7494
  src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
@@ -7300,7 +7496,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
7300
7496
  src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
7301
7497
 
7302
7498
  if (src1_convert_f16) {
7303
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
7499
+ src1_dfloat = src1_dfloat_a.alloc(ne00);
7304
7500
  ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
7305
7501
  ne00, 1, sizeof(float), 0, 0,
7306
7502
  ne00, 1, sizeof(half), 0, 0, stream);
@@ -7348,12 +7544,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
7348
7544
  break;
7349
7545
  }
7350
7546
 
7351
- #ifdef GGML_CUDA_F16
7352
- if (src1_convert_f16) {
7353
- ggml_cuda_pool_free(src1_dfloat, ash);
7354
- }
7355
- #endif // GGML_CUDA_F16
7356
-
7357
7547
  (void) src1;
7358
7548
  (void) dst;
7359
7549
  (void) src1_ddq_i;
@@ -7361,10 +7551,10 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
7361
7551
  (void) src1_padded_row_size;
7362
7552
  }
7363
7553
 
7364
- inline void ggml_cuda_op_mul_mat_cublas(
7554
+ static void ggml_cuda_op_mul_mat_cublas(
7365
7555
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7366
7556
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7367
- const int64_t src1_padded_row_size, const cudaStream_t & stream) {
7557
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
7368
7558
 
7369
7559
  GGML_ASSERT(src0_dd_i != nullptr);
7370
7560
  GGML_ASSERT(src1_ddf_i != nullptr);
@@ -7384,33 +7574,30 @@ inline void ggml_cuda_op_mul_mat_cublas(
7384
7574
  // ldc == nrows of the matrix that cuBLAS writes into
7385
7575
  int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
7386
7576
 
7387
- const int compute_capability = g_compute_capabilities[id];
7577
+ const int compute_capability = g_device_caps[id].cc;
7388
7578
 
7389
7579
  if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
7390
7580
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
7391
- half * src0_as_f16 = nullptr;
7392
- size_t src0_as = 0;
7581
+ cuda_pool_alloc<half> src0_as_f16;
7393
7582
  if (src0->type != GGML_TYPE_F16) {
7394
7583
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
7395
7584
  GGML_ASSERT(to_fp16_cuda != nullptr);
7396
7585
  size_t ne = row_diff*ne00;
7397
- src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
7398
- to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
7586
+ src0_as_f16.alloc(ne);
7587
+ to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
7399
7588
  }
7400
- const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
7589
+ const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
7401
7590
 
7402
- half * src1_as_f16 = nullptr;
7403
- size_t src1_as = 0;
7591
+ cuda_pool_alloc<half> src1_as_f16;
7404
7592
  if (src1->type != GGML_TYPE_F16) {
7405
7593
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
7406
7594
  GGML_ASSERT(to_fp16_cuda != nullptr);
7407
7595
  size_t ne = src1_ncols*ne10;
7408
- src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
7409
- to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
7596
+ src1_as_f16.alloc(ne);
7597
+ to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
7410
7598
  }
7411
- const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
7412
- size_t dst_as = 0;
7413
- half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
7599
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
7600
+ cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
7414
7601
 
7415
7602
  const half alpha_f16 = 1.0f;
7416
7603
  const half beta_f16 = 0.0f;
@@ -7419,36 +7606,25 @@ inline void ggml_cuda_op_mul_mat_cublas(
7419
7606
  CUBLAS_CHECK(
7420
7607
  cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7421
7608
  row_diff, src1_ncols, ne10,
7422
- &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
7423
- src1_ptr, CUDA_R_16F, ne10,
7424
- &beta_f16, dst_f16, CUDA_R_16F, ldc,
7609
+ &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
7610
+ src1_ptr, CUDA_R_16F, ne10,
7611
+ &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
7425
7612
  CUBLAS_COMPUTE_16F,
7426
7613
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7427
7614
 
7428
7615
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
7429
- to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
7430
-
7431
- ggml_cuda_pool_free(dst_f16, dst_as);
7432
-
7433
- if (src0_as != 0) {
7434
- ggml_cuda_pool_free(src0_as_f16, src0_as);
7435
- }
7436
-
7437
- if (src1_as != 0) {
7438
- ggml_cuda_pool_free(src1_as_f16, src1_as);
7439
- }
7616
+ to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
7440
7617
  }
7441
7618
  else {
7442
- float * src0_ddq_as_f32 = nullptr;
7443
- size_t src0_as = 0;
7619
+ cuda_pool_alloc<float> src0_ddq_as_f32;
7444
7620
 
7445
7621
  if (src0->type != GGML_TYPE_F32) {
7446
7622
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
7447
7623
  GGML_ASSERT(to_fp32_cuda != nullptr);
7448
- src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
7449
- to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
7624
+ src0_ddq_as_f32.alloc(row_diff*ne00);
7625
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
7450
7626
  }
7451
- const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
7627
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
7452
7628
 
7453
7629
  const float alpha = 1.0f;
7454
7630
  const float beta = 0.0f;
@@ -7460,10 +7636,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
7460
7636
  &alpha, src0_ddf_i, ne00,
7461
7637
  src1_ddf_i, ne10,
7462
7638
  &beta, dst_dd_i, ldc));
7463
-
7464
- if (src0_as != 0) {
7465
- ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
7466
- }
7467
7639
  }
7468
7640
 
7469
7641
  (void) dst;
@@ -7471,9 +7643,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
7471
7643
  (void) src1_padded_row_size;
7472
7644
  }
7473
7645
 
7474
- inline void ggml_cuda_op_rope(
7646
+ static void ggml_cuda_op_rope(
7475
7647
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7476
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7648
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7477
7649
 
7478
7650
  GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
7479
7651
  GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -7551,9 +7723,9 @@ inline void ggml_cuda_op_rope(
7551
7723
  (void) src1_dd;
7552
7724
  }
7553
7725
 
7554
- inline void ggml_cuda_op_alibi(
7726
+ static void ggml_cuda_op_alibi(
7555
7727
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7556
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7728
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7557
7729
 
7558
7730
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7559
7731
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7582,9 +7754,9 @@ inline void ggml_cuda_op_alibi(
7582
7754
  (void) src1_dd;
7583
7755
  }
7584
7756
 
7585
- inline void ggml_cuda_op_im2col(
7757
+ static void ggml_cuda_op_im2col(
7586
7758
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7587
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7759
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7588
7760
 
7589
7761
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
7590
7762
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7617,10 +7789,9 @@ inline void ggml_cuda_op_im2col(
7617
7789
  (void) src0_dd;
7618
7790
  }
7619
7791
 
7620
-
7621
- inline void ggml_cuda_op_sum_rows(
7792
+ static void ggml_cuda_op_sum_rows(
7622
7793
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7623
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7794
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7624
7795
 
7625
7796
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7626
7797
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7635,9 +7806,9 @@ inline void ggml_cuda_op_sum_rows(
7635
7806
  (void) src1_dd;
7636
7807
  }
7637
7808
 
7638
- inline void ggml_cuda_op_argsort(
7809
+ static void ggml_cuda_op_argsort(
7639
7810
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7640
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7811
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7641
7812
 
7642
7813
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7643
7814
  GGML_ASSERT( dst->type == GGML_TYPE_I32);
@@ -7654,9 +7825,9 @@ inline void ggml_cuda_op_argsort(
7654
7825
  (void) src1_dd;
7655
7826
  }
7656
7827
 
7657
- inline void ggml_cuda_op_diag_mask_inf(
7828
+ static void ggml_cuda_op_diag_mask_inf(
7658
7829
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7659
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7830
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7660
7831
 
7661
7832
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7662
7833
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7674,9 +7845,9 @@ inline void ggml_cuda_op_diag_mask_inf(
7674
7845
  (void) src1_dd;
7675
7846
  }
7676
7847
 
7677
- inline void ggml_cuda_op_soft_max(
7848
+ static void ggml_cuda_op_soft_max(
7678
7849
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7679
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7850
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7680
7851
 
7681
7852
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7682
7853
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7695,14 +7866,15 @@ inline void ggml_cuda_op_soft_max(
7695
7866
  (void) dst;
7696
7867
  }
7697
7868
 
7698
- inline void ggml_cuda_op_scale(
7869
+ static void ggml_cuda_op_scale(
7699
7870
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7700
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7871
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7701
7872
 
7702
7873
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7703
7874
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
7704
7875
 
7705
- const float scale = ((float *) dst->op_params)[0];
7876
+ float scale;
7877
+ memcpy(&scale, dst->op_params, sizeof(float));
7706
7878
 
7707
7879
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
7708
7880
  CUDA_CHECK(cudaGetLastError());
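
ggml_cuda_op_scale now reads the scale factor with memcpy instead of casting dst->op_params to float *, which avoids strict-aliasing and alignment assumptions when pulling a typed value out of a raw parameter buffer. A tiny standalone illustration (an int32_t array stands in for op_params):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        // A small raw parameter buffer, used here only as untyped storage.
        int32_t op_params[4] = {0};

        float scale = 0.25f;
        std::memcpy(op_params, &scale, sizeof(float));      // store

        float read_back;
        std::memcpy(&read_back, op_params, sizeof(float));  // load without type-punning casts
        std::printf("scale = %f\n", read_back);
        return 0;
    }
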
@@ -7712,9 +7884,9 @@ inline void ggml_cuda_op_scale(
7712
7884
  (void) src1_dd;
7713
7885
  }
7714
7886
 
7715
- inline void ggml_cuda_op_clamp(
7887
+ static void ggml_cuda_op_clamp(
7716
7888
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7717
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
7889
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7718
7890
 
7719
7891
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7720
7892
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7754,18 +7926,17 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
7754
7926
  float * src1_ddf = nullptr;
7755
7927
  float * dst_ddf = nullptr;
7756
7928
 
7757
- // as = actual size
7758
- size_t src0_asf = 0;
7759
- size_t src1_asf = 0;
7760
- size_t dst_asf = 0;
7929
+ cuda_pool_alloc<float> src0_f;
7930
+ cuda_pool_alloc<float> src1_f;
7931
+ cuda_pool_alloc<float> dst_f;
7761
7932
 
7762
7933
  ggml_cuda_set_device(g_main_device);
7763
- const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7934
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7764
7935
 
7765
7936
  if (src0_on_device) {
7766
7937
  src0_ddf = (float *) src0_extra->data_device[g_main_device];
7767
7938
  } else {
7768
- src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
7939
+ src0_ddf = src0_f.alloc(ggml_nelements(src0));
7769
7940
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
7770
7941
  }
7771
7942
 
@@ -7773,14 +7944,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
7773
7944
  if (src1_on_device) {
7774
7945
  src1_ddf = (float *) src1_extra->data_device[g_main_device];
7775
7946
  } else {
7776
- src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
7947
+ src1_ddf = src1_f.alloc(ggml_nelements(src1));
7777
7948
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
7778
7949
  }
7779
7950
  }
7780
7951
  if (dst_on_device) {
7781
7952
  dst_ddf = (float *) dst_extra->data_device[g_main_device];
7782
7953
  } else {
7783
- dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
7954
+ dst_ddf = dst_f.alloc(ggml_nelements(dst));
7784
7955
  }
7785
7956
 
7786
7957
  // do the computation
@@ -7792,16 +7963,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
7792
7963
  CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
7793
7964
  }
7794
7965
 
7795
- if (src0_asf > 0) {
7796
- ggml_cuda_pool_free(src0_ddf, src0_asf);
7797
- }
7798
- if (src1_asf > 0) {
7799
- ggml_cuda_pool_free(src1_ddf, src1_asf);
7800
- }
7801
- if (dst_asf > 0) {
7802
- ggml_cuda_pool_free(dst_ddf, dst_asf);
7803
- }
7804
-
7805
7966
  if (dst->backend == GGML_BACKEND_CPU) {
7806
7967
  CUDA_CHECK(cudaDeviceSynchronize());
7807
7968
  }
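
In this and the following hunks, the manual ggml_cuda_pool_malloc / ggml_cuda_pool_free pairs and their separate "actual size" bookkeeping are replaced by a cuda_pool_alloc<T> helper whose destructor returns the buffer to the pool when it goes out of scope. The helper itself is defined earlier in the file and is not shown in this diff; roughly, it is an RAII wrapper along these lines (a sketch, not the exact definition):

    // Sketch of an RAII wrapper over the device memory pool, using the
    // ggml_cuda_pool_malloc/ggml_cuda_pool_free functions this file already has.
    template<typename T>
    struct cuda_pool_alloc_sketch {
        T *    ptr         = nullptr;
        size_t actual_size = 0;

        cuda_pool_alloc_sketch() = default;
        explicit cuda_pool_alloc_sketch(size_t n) { alloc(n); }

        // n is an element count, matching calls such as alloc(ggml_nelements(src0))
        T * alloc(size_t n) {
            ptr = (T *) ggml_cuda_pool_malloc(n*sizeof(T), &actual_size);
            return ptr;
        }

        T * get() const { return ptr; }

        ~cuda_pool_alloc_sketch() {
            if (ptr != nullptr) {
                ggml_cuda_pool_free(ptr, actual_size); // released automatically at scope exit
            }
        }

        // the wrapper owns the pooled buffer, so it is not copyable
        cuda_pool_alloc_sketch(const cuda_pool_alloc_sketch &) = delete;
        cuda_pool_alloc_sketch & operator=(const cuda_pool_alloc_sketch &) = delete;
    };

This is what lets the explicit free blocks further down in ggml_cuda_op_flatten and ggml_cuda_op_mul_mat be removed.
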
@@ -7818,12 +7979,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
7818
7979
 
7819
7980
  #ifdef NDEBUG
7820
7981
  for (int id = 0; id < g_device_count; ++id) {
7821
- CUDA_CHECK(ggml_cuda_set_device(id));
7982
+ ggml_cuda_set_device(id);
7822
7983
  CUDA_CHECK(cudaDeviceSynchronize());
7823
7984
  }
7824
7985
 
7825
7986
  for (int id = 0; id < g_device_count; ++id) {
7826
- CUDA_CHECK(ggml_cuda_set_device(id));
7987
+ ggml_cuda_set_device(id);
7827
7988
 
7828
7989
  for (int id_other = 0; id_other < g_device_count; ++id_other) {
7829
7990
  if (id == id_other) {
@@ -7857,7 +8018,6 @@ static void ggml_cuda_op_mul_mat(
7857
8018
  const int64_t ne01 = src0->ne[1];
7858
8019
  const int64_t ne02 = src0->ne[2];
7859
8020
  const int64_t ne03 = src0->ne[3];
7860
- const int64_t nrows0 = ggml_nrows(src0);
7861
8021
 
7862
8022
  const int64_t ne10 = src1->ne[0];
7863
8023
  const int64_t ne11 = src1->ne[1];
@@ -7900,27 +8060,29 @@ static void ggml_cuda_op_mul_mat(
7900
8060
  GGML_ASSERT(!(split && ne03 > 1));
7901
8061
  GGML_ASSERT(!(split && ne02 < ne12));
7902
8062
 
7903
- // dd = data device
7904
- char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
7905
- float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
7906
- char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
7907
- float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
8063
+ struct dev_data {
8064
+ cuda_pool_alloc<char> src0_dd_alloc;
8065
+ cuda_pool_alloc<float> src1_ddf_alloc;
8066
+ cuda_pool_alloc<char> src1_ddq_alloc;
8067
+ cuda_pool_alloc<float> dst_dd_alloc;
8068
+
8069
+ char * src0_dd = nullptr;
8070
+ float * src1_ddf = nullptr; // float
8071
+ char * src1_ddq = nullptr; // q8_1
8072
+ float * dst_dd = nullptr;
7908
8073
 
7909
- // as = actual size
7910
- size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
7911
- size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
7912
- size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
7913
- size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
8074
+ int64_t row_low;
8075
+ int64_t row_high;
8076
+ };
7914
8077
 
7915
- int64_t row_low[GGML_CUDA_MAX_DEVICES];
7916
- int64_t row_high[GGML_CUDA_MAX_DEVICES];
8078
+ dev_data dev[GGML_CUDA_MAX_DEVICES];
7917
8079
 
7918
8080
  int used_devices = 0;
7919
8081
 
7920
- for (int64_t id = 0; id < g_device_count; ++id) {
8082
+ for (int id = 0; id < g_device_count; ++id) {
7921
8083
  // by default, use all rows
7922
- row_low[id] = 0;
7923
- row_high[id] = ne01;
8084
+ dev[id].row_low = 0;
8085
+ dev[id].row_high = ne01;
7924
8086
 
7925
8087
  // for multi GPU, get the row boundaries from tensor split
7926
8088
  // and round to mul_mat_q tile sizes
@@ -7928,19 +8090,23 @@ static void ggml_cuda_op_mul_mat(
7928
8090
  const int64_t rounding = get_row_rounding(src0->type);
7929
8091
 
7930
8092
  if (id != 0) {
7931
- row_low[id] = ne01*g_tensor_split[id];
7932
- row_low[id] -= row_low[id] % rounding;
8093
+ dev[id].row_low = ne01*g_tensor_split[id];
8094
+ if (dev[id].row_low < ne01) {
8095
+ dev[id].row_low -= dev[id].row_low % rounding;
8096
+ }
7933
8097
  }
7934
8098
 
7935
8099
  if (id != g_device_count - 1) {
7936
- row_high[id] = ne01*g_tensor_split[id + 1];
7937
- row_high[id] -= row_high[id] % rounding;
8100
+ dev[id].row_high = ne01*g_tensor_split[id + 1];
8101
+ if (dev[id].row_high < ne01) {
8102
+ dev[id].row_high -= dev[id].row_high % rounding;
8103
+ }
7938
8104
  }
7939
8105
  }
7940
8106
  }
7941
8107
 
7942
- for (int64_t id = 0; id < g_device_count; ++id) {
7943
- if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
8108
+ for (int id = 0; id < g_device_count; ++id) {
8109
+ if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
7944
8110
  continue;
7945
8111
  }
7946
8112
 
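
Note that the tile rounding is now applied only while a boundary still lies inside the tensor, so a split point that already equals ne01 is not pulled back below it. A small self-contained example of the boundary computation; ne01, rounding and the split fractions here are made up for illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne01     = 4096;   // hypothetical number of rows in src0
        const int64_t rounding = 128;    // hypothetical mul_mat_q tile rounding
        const float   tensor_split[3] = {0.00f, 0.30f, 0.75f}; // cumulative fractions
        const int     device_count = 3;

        for (int id = 0; id < device_count; ++id) {
            int64_t row_low  = 0;
            int64_t row_high = ne01;

            if (id != 0) {
                row_low = ne01*tensor_split[id];
                if (row_low < ne01) {
                    row_low -= row_low % rounding;   // keep the boundary tile-aligned
                }
            }
            if (id != device_count - 1) {
                row_high = ne01*tensor_split[id + 1];
                if (row_high < ne01) {
                    row_high -= row_high % rounding;
                }
            }
            printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
        return 0;
    }
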
@@ -7950,42 +8116,41 @@ static void ggml_cuda_op_mul_mat(
7950
8116
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
7951
8117
 
7952
8118
  ggml_cuda_set_device(id);
7953
- const cudaStream_t stream = g_cudaStreams[id][0];
8119
+ cudaStream_t stream = g_cudaStreams[id][0];
7954
8120
 
7955
8121
  if (src0_on_device && src0_is_contiguous) {
7956
- src0_dd[id] = (char *) src0_extra->data_device[id];
8122
+ dev[id].src0_dd = (char *) src0_extra->data_device[id];
7957
8123
  } else {
7958
- // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
7959
- src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
8124
+ dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
7960
8125
  }
7961
8126
 
7962
8127
  if (src1_on_device && src1_is_contiguous) {
7963
- src1_ddf[id] = (float *) src1_extra->data_device[id];
8128
+ dev[id].src1_ddf = (float *) src1_extra->data_device[id];
7964
8129
  } else {
7965
- src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
8130
+ dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1));
7966
8131
  }
7967
8132
 
7968
8133
  if (convert_src1_to_q8_1) {
7969
- src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
8134
+ dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
7970
8135
 
7971
8136
  if (src1_on_device && src1_is_contiguous) {
7972
- quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
8137
+ quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
7973
8138
  CUDA_CHECK(cudaGetLastError());
7974
8139
  }
7975
8140
  }
7976
8141
 
7977
8142
  if (dst_on_device) {
7978
- dst_dd[id] = (float *) dst_extra->data_device[id];
8143
+ dev[id].dst_dd = (float *) dst_extra->data_device[id];
7979
8144
  } else {
7980
- const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
7981
- dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
8145
+ const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
8146
+ dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf);
7982
8147
  }
7983
8148
  }
7984
8149
 
7985
8150
  // if multiple devices are used they need to wait for the main device
7986
8151
  // here an event is recorded that signals that the main device has finished calculating the input data
7987
8152
  if (split && used_devices > 1) {
7988
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8153
+ ggml_cuda_set_device(g_main_device);
7989
8154
  CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
7990
8155
  }
7991
8156
 
@@ -7994,17 +8159,17 @@ static void ggml_cuda_op_mul_mat(
7994
8159
  const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
7995
8160
  const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
7996
8161
 
7997
- for (int64_t id = 0; id < g_device_count; ++id) {
7998
- if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
8162
+ for (int id = 0; id < g_device_count; ++id) {
8163
+ if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
7999
8164
  continue;
8000
8165
  }
8001
8166
 
8002
8167
  const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
8003
8168
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
8004
- const int64_t row_diff = row_high[id] - row_low[id];
8169
+ const int64_t row_diff = dev[id].row_high - dev[id].row_low;
8005
8170
 
8006
8171
  ggml_cuda_set_device(id);
8007
- const cudaStream_t stream = g_cudaStreams[id][is];
8172
+ cudaStream_t stream = g_cudaStreams[id][is];
8008
8173
 
8009
8174
  // wait for main GPU data if necessary
8010
8175
  if (split && (id != g_main_device || is != 0)) {
@@ -8018,34 +8183,34 @@ static void ggml_cuda_op_mul_mat(
8018
8183
  const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
8019
8184
 
8020
8185
  // for split tensors the data begins at i0 == i0_offset_low
8021
- char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
8022
- float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
8023
- char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
8024
- float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
8186
+ char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
8187
+ float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
8188
+ char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
8189
+ float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
8025
8190
 
8026
8191
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
8027
8192
  // in that case an offset on dst_ddf_i is needed
8028
8193
  if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
8029
- dst_dd_i += row_low[id]; // offset is 0 if no tensor split
8194
+ dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
8030
8195
  }
8031
8196
 
8032
8197
  // copy src0, src1 to device if necessary
8033
8198
  if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
8034
8199
  if (id != g_main_device) {
8035
8200
  if (convert_src1_to_q8_1) {
8036
- char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
8037
- CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
8038
- cudaMemcpyDeviceToDevice, stream));
8201
+ char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
8202
+ CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
8203
+ src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
8039
8204
  } else {
8040
8205
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
8041
8206
  src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
8042
- CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
8043
- cudaMemcpyDeviceToDevice, stream));
8207
+ CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
8208
+ src1_ncols*ne10*sizeof(float), stream));
8044
8209
  }
8045
8210
  }
8046
8211
  } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
8047
8212
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
8048
- src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
8213
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
8049
8214
  } else {
8050
8215
  GGML_ASSERT(false);
8051
8216
  }
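
Cross-device copies of the shared src1 data now go through cudaMemcpyPeerAsync, which names the source and destination devices explicitly instead of passing cudaMemcpyDeviceToDevice and leaving the runtime to resolve which device owns each pointer. The call shape, as a small helper with hypothetical names:

    // Copy n bytes from a buffer on device src_dev into a buffer on device
    // dst_dev, ordered on `stream`. CUDA_CHECK is the error-checking macro this
    // file already uses. If direct peer access is not available, the runtime
    // stages the transfer through host memory.
    static void copy_bytes_peer_async(void * dst, int dst_dev,
                                      const void * src, int src_dev,
                                      size_t n, cudaStream_t stream) {
        CUDA_CHECK(cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, n, stream));
    }
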
@@ -8056,12 +8221,12 @@ static void ggml_cuda_op_mul_mat(
8056
8221
  }
8057
8222
 
8058
8223
  if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
8059
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
8224
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
8060
8225
  }
8061
8226
 
8062
8227
  // do the computation
8063
8228
  op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
8064
- row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
8229
+ dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
8065
8230
  CUDA_CHECK(cudaGetLastError());
8066
8231
 
8067
8232
  // copy dst to host or other device if necessary
@@ -8085,9 +8250,25 @@ static void ggml_cuda_op_mul_mat(
8085
8250
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
8086
8251
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
8087
8252
  GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
8088
- dhf_dst_i += src1_col_0*ne0 + row_low[id];
8089
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
8090
- row_diff*sizeof(float), src1_ncols, kind, stream));
8253
+ dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
8254
+ #if !defined(GGML_USE_HIPBLAS)
8255
+ if (kind == cudaMemcpyDeviceToDevice) {
8256
+ // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
8257
+ cudaMemcpy3DPeerParms p = {};
8258
+ p.dstDevice = g_main_device;
8259
+ p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
8260
+ p.srcDevice = id;
8261
+ p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
8262
+ p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
8263
+ CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
8264
+ } else
8265
+ #endif
8266
+ {
8267
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
8268
+ dst_dd_i, row_diff*sizeof(float),
8269
+ row_diff*sizeof(float), src1_ncols,
8270
+ kind, stream));
8271
+ }
8091
8272
  } else {
8092
8273
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
8093
8274
  GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
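
For the device-to-device case, the strided write of each partial result is rewritten as a cudaMemcpy3DPeerAsync call on CUDA builds because, as the new comment notes, cudaMemcpy2DAsync may fail between VMM pools on different devices; the remaining cases keep the plain 2D copy. The parameter setup, factored into a sketch (the function name and the packed-source assumption are illustrative):

    // Copy an ncols x nrows block of floats from device src_dev, where the
    // source rows are packed back to back, into a matrix on device dst_dev
    // whose rows are dst_stride floats apart.
    static void copy_block_peer_async(float * dst, size_t dst_stride, int dst_dev,
                                      const float * src, int src_dev,
                                      size_t ncols, size_t nrows, cudaStream_t stream) {
        cudaMemcpy3DPeerParms p = {};
        p.dstDevice = dst_dev;
        p.dstPtr    = make_cudaPitchedPtr(dst, dst_stride*sizeof(float), ncols, nrows);
        p.srcDevice = src_dev;
        p.srcPtr    = make_cudaPitchedPtr((void *) src, ncols*sizeof(float), ncols, nrows);
        p.extent    = make_cudaExtent(ncols*sizeof(float), nrows, 1); // extent width is in bytes
        CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
    }
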
@@ -8104,35 +8285,14 @@ static void ggml_cuda_op_mul_mat(
8104
8285
  }
8105
8286
  }
8106
8287
 
8107
- for (int64_t id = 0; id < g_device_count; ++id) {
8108
- if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
8109
- continue;
8110
- }
8111
- CUDA_CHECK(ggml_cuda_set_device(id));
8112
-
8113
- // free buffers again when done
8114
- if (src0_as[id] > 0) {
8115
- ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
8116
- }
8117
- if (src1_asf[id] > 0) {
8118
- ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
8119
- }
8120
- if (src1_asq[id] > 0) {
8121
- ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
8122
- }
8123
- if (dst_as[id] > 0) {
8124
- ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
8125
- }
8126
- }
8127
-
8128
8288
  // main device waits for all other devices to be finished
8129
8289
  if (split && g_device_count > 1) {
8130
8290
  int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
8131
8291
  is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
8132
8292
 
8133
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8134
- for (int64_t id = 0; id < g_device_count; ++id) {
8135
- if (row_low[id] == row_high[id]) {
8293
+ ggml_cuda_set_device(g_main_device);
8294
+ for (int id = 0; id < g_device_count; ++id) {
8295
+ if (dev[id].row_low == dev[id].row_high) {
8136
8296
  continue;
8137
8297
  }
8138
8298
  for (int64_t is = 0; is < is_max; ++is) {
@@ -8142,7 +8302,7 @@ static void ggml_cuda_op_mul_mat(
8142
8302
  }
8143
8303
 
8144
8304
  if (dst->backend == GGML_BACKEND_CPU) {
8145
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8305
+ ggml_cuda_set_device(g_main_device);
8146
8306
  CUDA_CHECK(cudaDeviceSynchronize());
8147
8307
  }
8148
8308
  }
@@ -8252,7 +8412,7 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
8252
8412
 
8253
8413
  const int64_t ne12 = src1->ne[2];
8254
8414
 
8255
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8415
+ ggml_cuda_set_device(g_main_device);
8256
8416
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8257
8417
 
8258
8418
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -8284,7 +8444,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
8284
8444
 
8285
8445
  const int64_t ne12 = src1->ne[2];
8286
8446
 
8287
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8447
+ ggml_cuda_set_device(g_main_device);
8288
8448
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8289
8449
 
8290
8450
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -8355,7 +8515,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8355
8515
  const int64_t ne1 = ggml_nelements(src1);
8356
8516
  const int64_t ne = ggml_nelements(dst);
8357
8517
 
8358
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8518
+ ggml_cuda_set_device(g_main_device);
8359
8519
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8360
8520
 
8361
8521
  CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
@@ -8374,14 +8534,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8374
8534
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
8375
8535
  GGML_ASSERT(to_fp16_cuda != nullptr);
8376
8536
 
8377
- size_t src1_as = 0;
8378
- half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
8379
- to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
8380
-
8381
- size_t dst_as = 0;
8537
+ cuda_pool_alloc<half> src1_as_f16(ne1);
8538
+ to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
8382
8539
 
8383
- half * dst_f16 = nullptr;
8384
- char * dst_t = nullptr;
8540
+ cuda_pool_alloc<half> dst_f16;
8541
+ char * dst_t;
8385
8542
 
8386
8543
  cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
8387
8544
  cudaDataType_t cu_data_type = CUDA_R_16F;
@@ -8400,8 +8557,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8400
8557
  const void * beta = &beta_f16;
8401
8558
 
8402
8559
  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8403
- dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8404
- dst_t = (char *) dst_f16;
8560
+ dst_t = (char *) dst_f16.alloc(ne);
8405
8561
 
8406
8562
  nbd2 /= sizeof(float) / sizeof(half);
8407
8563
  nbd3 /= sizeof(float) / sizeof(half);
@@ -8448,9 +8604,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8448
8604
  CUBLAS_CHECK(
8449
8605
  cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8450
8606
  ne01, ne11, ne10,
8451
- alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
8452
- (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
8453
- beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
8607
+ alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
8608
+ (const char *) src1_as_f16.get(), CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
8609
+ beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
8454
8610
  ne12*ne13,
8455
8611
  cu_compute_type,
8456
8612
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -8458,19 +8614,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8458
8614
  // use cublasGemmBatchedEx
8459
8615
  const int ne23 = ne12*ne13;
8460
8616
 
8461
- const void ** ptrs_src = nullptr;
8462
- void ** ptrs_dst = nullptr;
8463
-
8464
- size_t ptrs_src_s = 0;
8465
- size_t ptrs_dst_s = 0;
8466
-
8467
- ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
8468
- ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
8617
+ cuda_pool_alloc<const void *> ptrs_src(2*ne23);
8618
+ cuda_pool_alloc< void *> ptrs_dst(1*ne23);
8469
8619
 
8470
8620
  dim3 block_dims(ne13, ne12);
8471
8621
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
8472
- src0_as_f16, src1_as_f16, dst_t,
8473
- ptrs_src, ptrs_dst,
8622
+ src0_as_f16, src1_as_f16.get(), dst_t,
8623
+ ptrs_src.get(), ptrs_dst.get(),
8474
8624
  ne12, ne13,
8475
8625
  ne23,
8476
8626
  nb02, nb03,
@@ -8482,30 +8632,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8482
8632
  CUBLAS_CHECK(
8483
8633
  cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8484
8634
  ne01, ne11, ne10,
8485
- alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
8486
- (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
8487
- beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
8635
+ alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
8636
+ (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
8637
+ beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
8488
8638
  ne23,
8489
8639
  cu_compute_type,
8490
8640
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8491
-
8492
- if (ptrs_src_s != 0) {
8493
- ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
8494
- }
8495
- if (ptrs_dst_s != 0) {
8496
- ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
8497
- }
8498
8641
  }
8499
8642
  #endif
8500
8643
 
8501
8644
  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8502
8645
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8503
- to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8504
-
8505
- ggml_cuda_pool_free(dst_f16, dst_as);
8646
+ to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
8506
8647
  }
8507
-
8508
- ggml_cuda_pool_free(src1_as_f16, src1_as);
8509
8648
  }
8510
8649
 
8511
8650
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8517,9 +8656,9 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
8517
8656
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
8518
8657
 
8519
8658
  int64_t min_compute_capability = INT_MAX;
8520
- for (int64_t id = 0; id < g_device_count; ++id) {
8521
- if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8522
- min_compute_capability = g_compute_capabilities[id];
8659
+ for (int id = 0; id < g_device_count; ++id) {
8660
+ if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8661
+ min_compute_capability = g_device_caps[id].cc;
8523
8662
  }
8524
8663
  }
8525
8664
 
@@ -8660,7 +8799,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
8660
8799
  const int64_t ne1 = ggml_nelements(src1);
8661
8800
  const int64_t ne = ggml_nelements(dst);
8662
8801
 
8663
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8802
+ ggml_cuda_set_device(g_main_device);
8664
8803
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8665
8804
 
8666
8805
  CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
@@ -8778,7 +8917,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8778
8917
 
8779
8918
  std::vector<char> ids_host(ggml_nbytes(ids));
8780
8919
 
8781
- const cudaStream_t stream = g_cudaStreams[g_main_device][0];
8920
+ cudaStream_t stream = g_cudaStreams[g_main_device][0];
8782
8921
 
8783
8922
  if (ids->backend == GGML_BACKEND_GPU) {
8784
8923
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
@@ -8832,17 +8971,16 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8832
8971
  ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8833
8972
  }
8834
8973
  } else {
8835
- size_t as_src1, as_dst;
8836
- char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
8837
- char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
8974
+ cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
8975
+ cuda_pool_alloc<char> dst_contiguous(sizeof(float)*ggml_nelements(dst));
8838
8976
 
8839
- src1_row_extra.data_device[g_main_device] = src1_contiguous;
8840
- dst_row_extra.data_device[g_main_device] = dst_contiguous;
8977
+ src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
8978
+ dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
8841
8979
 
8842
8980
  const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
8843
8981
  cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
8844
8982
  const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
8845
- cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
8983
+ cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
8846
8984
 
8847
8985
  for (int32_t row_id = 0; row_id < n_as; ++row_id) {
8848
8986
  const struct ggml_tensor * src0_row = dst->src[row_id + 2];
@@ -8857,7 +8995,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8857
8995
 
8858
8996
  GGML_ASSERT(row_id >= 0 && row_id < n_as);
8859
8997
 
8860
- CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
8998
+ CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
8861
8999
  nb11, src1_kind, stream));
8862
9000
  num_src1_rows++;
8863
9001
  }
@@ -8889,14 +9027,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8889
9027
 
8890
9028
  GGML_ASSERT(row_id >= 0 && row_id < n_as);
8891
9029
 
8892
- CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
9030
+ CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
8893
9031
  nb1, dst_kind, stream));
8894
9032
  num_src1_rows++;
8895
9033
  }
8896
9034
  }
8897
-
8898
- ggml_cuda_pool_free(src1_contiguous, as_src1);
8899
- ggml_cuda_pool_free(dst_contiguous, as_dst);
8900
9035
  }
8901
9036
 
8902
9037
  if (dst->backend == GGML_BACKEND_CPU) {
@@ -8938,7 +9073,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
8938
9073
  const int64_t nb11 = src1->nb[1];
8939
9074
  const int64_t nb12 = src1->nb[2];
8940
9075
 
8941
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
9076
+ ggml_cuda_set_device(g_main_device);
8942
9077
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8943
9078
 
8944
9079
  const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -9028,7 +9163,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
9028
9163
  ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
9029
9164
  memset(extra, 0, sizeof(*extra));
9030
9165
 
9031
- for (int64_t id = 0; id < g_device_count; ++id) {
9166
+ for (int id = 0; id < g_device_count; ++id) {
9032
9167
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
9033
9168
  continue;
9034
9169
  }
@@ -9099,15 +9234,14 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
9099
9234
 
9100
9235
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9101
9236
 
9102
- for (int64_t id = 0; id < g_device_count; ++id) {
9237
+ for (int id = 0; id < g_device_count; ++id) {
9238
+ ggml_cuda_set_device(id);
9103
9239
  if (extra->data_device[id] != nullptr) {
9104
- CUDA_CHECK(ggml_cuda_set_device(id));
9105
9240
  CUDA_CHECK(cudaFree(extra->data_device[id]));
9106
9241
  }
9107
9242
 
9108
9243
  for (int64_t is = 0; is < MAX_STREAMS; ++is) {
9109
9244
  if (extra->events[id][is] != nullptr) {
9110
- CUDA_CHECK(ggml_cuda_set_device(id));
9111
9245
  CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
9112
9246
  }
9113
9247
  }
@@ -9161,7 +9295,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
9161
9295
  force_inplace;
9162
9296
  const size_t size = ggml_nbytes(tensor);
9163
9297
 
9164
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
9298
+ ggml_cuda_set_device(g_main_device);
9165
9299
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
9166
9300
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
9167
9301
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
@@ -9238,7 +9372,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
9238
9372
  GGML_ASSERT(ggml_is_contiguous(tensor));
9239
9373
 
9240
9374
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9241
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
9375
+ ggml_cuda_set_device(g_main_device);
9242
9376
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
9243
9377
  }
9244
9378
 
@@ -9662,12 +9796,16 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
9662
9796
  // host buffer type
9663
9797
 
9664
9798
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
9665
- CUDA_CHECK(cudaFreeHost(buffer->context));
9799
+ ggml_cuda_host_free(buffer->context);
9666
9800
  }
9667
9801
 
9668
9802
  static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
9669
- void * ptr;
9670
- CUDA_CHECK(cudaMallocHost(&ptr, size));
9803
+ void * ptr = ggml_cuda_host_malloc(size);
9804
+
9805
+ if (ptr == nullptr) {
9806
+ // fallback to cpu buffer
9807
+ return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
9808
+ }
9671
9809
 
9672
9810
  // FIXME: this is a hack to avoid having to implement a new buffer type
9673
9811
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
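
The pinned host buffer allocation above now goes through ggml_cuda_host_malloc and, when pinning fails, falls back to an ordinary CPU buffer instead of aborting, with ggml_cuda_host_free as the matching release path. A minimal sketch of the allocate-or-return-null half of that shape; the real ggml_cuda_host_malloc lives elsewhere in this file and may differ in detail:

    // Try to get page-locked host memory; return nullptr instead of failing
    // hard so the caller can fall back to a plain CPU buffer, as done above.
    static void * pinned_host_malloc_or_null(size_t size) {
        void * ptr = nullptr;
        cudaError_t err = cudaMallocHost(&ptr, size);
        if (err != cudaSuccess) {
            cudaGetLastError(); // reset the sticky error state before continuing
            return nullptr;
        }
        return ptr;
    }
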