llama_cpp 0.10.2 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-backend.c +6 -10
- data/ext/llama_cpp/src/ggml-cuda.cu +510 -372
- data/ext/llama_cpp/src/ggml-quants.c +25 -344
- data/ext/llama_cpp/src/ggml.c +7 -8
- data/ext/llama_cpp/src/ggml.h +2 -0
- data/ext/llama_cpp/src/llama.cpp +432 -39
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
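Much of the ggml-cuda.cu change below replaces manual ggml_cuda_pool_malloc/ggml_cuda_pool_free pairs with an RAII helper, cuda_pool_alloc<T> (alloc()/get(), freed in the destructor), and adds a virtual-memory-backed buffer pool. The stand-alone sketch below only illustrates that RAII pattern; scoped_device_buffer is a hypothetical name and it uses plain cudaMalloc/cudaFree instead of the pool added in this diff.

// Hypothetical illustration only (not part of the package): a minimal RAII
// device buffer mirroring the cuda_pool_alloc<T> pattern introduced below,
// backed directly by cudaMalloc/cudaFree.
#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
struct scoped_device_buffer {
    T * ptr = nullptr;

    explicit scoped_device_buffer(size_t n_elements) {
        // allocate n_elements * sizeof(T) bytes of device memory
        if (cudaMalloc((void **) &ptr, n_elements * sizeof(T)) != cudaSuccess) {
            std::fprintf(stderr, "cudaMalloc failed\n");
            ptr = nullptr;
        }
    }
    ~scoped_device_buffer() {
        // memory is released automatically when the object goes out of scope
        if (ptr != nullptr) {
            cudaFree(ptr);
        }
    }
    T * get() const { return ptr; }

    scoped_device_buffer(const scoped_device_buffer &) = delete;
    scoped_device_buffer & operator=(const scoped_device_buffer &) = delete;
};

int main() {
    scoped_device_buffer<float> buf(1024); // device buffer for 1024 floats
    std::printf("allocated: %s\n", buf.get() != nullptr ? "yes" : "no");
    return 0; // buf is freed here by the destructor
}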
@@ -68,8 +68,9 @@
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #endif
 #define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
@@ -86,10 +87,29 @@
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap abort
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 #else
 #include <cuda_runtime.h>
+#include <cuda.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+
+#if CUDART_VERSION < 11020
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
+
 #endif // defined(GGML_USE_HIPBLAS)
 
 #include "ggml-cuda.h"
@@ -144,7 +164,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
 const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
 #if __has_builtin(__builtin_elementwise_sub_sat)
 const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-return reinterpret_cast<const int&>(c);
+return reinterpret_cast<const int &>(c);
 #else
 int8x4_t c;
 int16_t tmp;
@@ -155,7 +175,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
 if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
 c[i] = tmp;
 }
-return reinterpret_cast<int&>(c);
+return reinterpret_cast<int &>(c);
 #endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
@@ -193,45 +213,59 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
-
-
-
-
-
-
-
-
-
-
-
+[[noreturn]]
+static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
+int id = -1; // in case cudaGetDevice fails
+cudaGetDevice(&id);
+
+fprintf(stderr, "CUDA error: %s\n", msg);
+fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
+fprintf(stderr, " %s\n", stmt);
+// abort with GGML_ASSERT to get a stack trace
+GGML_ASSERT(!"CUDA error");
+}
+
+#define CUDA_CHECK_GEN(err, success, error_fn) \
+do { \
+auto err_ = (err); \
+if (err_ != (success)) { \
+ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
+} \
 } while (0)
 
+#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
+
 #if CUDART_VERSION >= 12000
-
-
-
-if (err_ != CUBLAS_STATUS_SUCCESS) { \
-int id; \
-cudaGetDevice(&id); \
-fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
-err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
-fprintf(stderr, "current device: %d\n", id); \
-GGML_ASSERT(!"cuBLAS error"); \
-} \
-} while (0)
+static const char * cublas_get_error_str(const cublasStatus_t err) {
+return cublasGetStatusString(err);
+}
 #else
-
-
-
-
-
-
-
-
-
-
-
-
+static const char * cublas_get_error_str(const cublasStatus_t err) {
+switch (err) {
+case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+default: return "unknown error";
+}
+}
+#endif // CUDART_VERSION >= 12000
+
+#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
+
+#if !defined(GGML_USE_HIPBLAS)
+static const char * cu_get_error_str(CUresult err) {
+const char * err_str;
+cuGetErrorString(err, &err_str);
+return err_str;
+}
+#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
+#endif
 
 #if CUDART_VERSION >= 11100
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -287,10 +321,10 @@ typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * s
 typedef void (*ggml_cuda_op_mul_mat_t)(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-const int64_t src1_padded_row_size,
+const int64_t src1_padded_row_size, cudaStream_t stream);
 typedef void (*ggml_cuda_op_flatten_t)(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
@@ -496,22 +530,29 @@ struct ggml_tensor_extra_gpu {
 
 // this is faster on Windows
 // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
-
+static void ggml_cuda_set_device(const int device) {
 int current_device;
 CUDA_CHECK(cudaGetDevice(&current_device));
 
 if (device == current_device) {
-return
+return;
 }
 
-
+CUDA_CHECK(cudaSetDevice(device));
 }
 
 static int g_device_count = -1;
 static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
+struct cuda_device_capabilities {
+int cc; // compute capability
+bool vmm; // virtual memory support
+size_t vmm_granularity; // granularity of virtual memory
+};
+
+static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
+
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
@@ -553,6 +594,7 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
 return b;
+GGML_UNUSED(a);
 }
 
 static __device__ __forceinline__ float op_add(const float a, const float b) {
@@ -674,7 +716,7 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
 dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
-static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
+static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
 const float GELU_QUICK_COEF = -1.702f;
 const int i = blockDim.x*blockIdx.x + threadIdx.x;
 if (i >= k) {
@@ -683,7 +725,7 @@ static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
 dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
 }
 
-static __global__ void tanh_f32(const float *x, float *dst, int k) {
+static __global__ void tanh_f32(const float * x, float * dst, int k) {
 const int i = blockDim.x*blockIdx.x + threadIdx.x;
 if (i >= k) {
 return;
@@ -700,7 +742,7 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
 dst[i] = fmaxf(x[i], 0);
 }
 
-static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
+static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
 const int i = blockDim.x*blockIdx.x + threadIdx.x;
 if (i >= k) {
 return;
@@ -753,7 +795,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
 }
 }
 
-static __global__ void concat_f32(const float
+static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
 int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 if (nidx >= ne0) {
 return;
@@ -778,7 +820,7 @@ static __global__ void concat_f32(const float *x,const float *y, float *dst, c
 }
 }
 
-static __global__ void upscale_f32(const float
+static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
 int ne0 = ne00 * scale_factor;
 int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 if (nidx >= ne0) {
@@ -798,7 +840,7 @@ static __global__ void upscale_f32(const float *x, float *dst, const int ne00,
 dst[offset_dst] = x[offset_src];
 }
 
-static __global__ void pad_f32(const float
+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
 int nidx = threadIdx.x + blockIdx.x * blockDim.x;
 if (nidx >= ne0) {
 return;
@@ -4700,7 +4742,6 @@ static __global__ void mul_mat_p021_f16_f32(
 
 const int row_y = col_x;
 
-
 // y is not transposed but permuted
 const int iy = channel*nrows_y + row_y;
 
@@ -5266,17 +5307,17 @@ static __global__ void im2col_f32_f16(
 const int ky = (i - kd) / OW;
 const int ix = i % OW;
 
-const
-const
+const int64_t iiw = ix * s0 + kx * d0 - p0;
+const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
 
-const
+const int64_t offset_dst =
 (blockIdx.y * OW + ix) * CHW +
 (blockIdx.z * (KW * KH) + ky * KW + kx);
 
 if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
 dst[offset_dst] = __float2half(0.0f);
 } else {
-const
+const int64_t offset_src = blockIdx.z * offset_delta;
 dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
 }
 }
@@ -5375,7 +5416,7 @@ struct bin_bcast_cuda {
 cne[3] = 1;
 };
 
-auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
 cnb[1] *= cne[1];
 cnb[2] *= cne[2];
 cnb[3] *= cne[3];
@@ -5868,7 +5909,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -5913,7 +5954,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -5958,7 +5999,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -6003,7 +6044,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -6048,7 +6089,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -6093,7 +6134,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -6140,7 +6181,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -6186,7 +6227,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -6231,7 +6272,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -6276,7 +6317,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
 
 int id;
 CUDA_CHECK(cudaGetDevice(&id));
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 int mmq_x, mmq_y, nwarps;
 if (compute_capability >= CC_RDNA2) {
@@ -6536,30 +6577,30 @@ struct scoped_spin_lock {
 scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
 };
 
-
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+// #define DEBUG_CUDA_MALLOC
+struct ggml_cuda_buffer {
 void * ptr = nullptr;
 size_t size = 0;
 };
 
-static
-static
+static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
+static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 
-static void *
+static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
 scoped_spin_lock lock(g_cuda_pool_lock);
-int id;
-CUDA_CHECK(cudaGetDevice(&id));
 #ifdef DEBUG_CUDA_MALLOC
 int nnz = 0;
-size_t max_size = 0
+size_t max_size = 0;
 #endif
 size_t best_diff = 1ull << 36;
 int ibest = -1;
 for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-
+ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
 if (b.ptr != nullptr) {
 #ifdef DEBUG_CUDA_MALLOC
 ++nnz;
-tot_size += b.size;
 if (b.size > max_size) max_size = b.size;
 #endif
 if (b.size >= size) {
@@ -6579,32 +6620,32 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
 }
 }
 if (ibest >= 0) {
-
+ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest];
 void * ptr = b.ptr;
 *actual_size = b.size;
 b.ptr = nullptr;
 b.size = 0;
 return ptr;
 }
-#ifdef DEBUG_CUDA_MALLOC
-fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
-(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
-#endif
 void * ptr;
 size_t look_ahead_size = (size_t) (1.05 * size);
 look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+ggml_cuda_set_device(device);
 CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
 *actual_size = look_ahead_size;
+g_cuda_pool_size[device] += look_ahead_size;
+#ifdef DEBUG_CUDA_MALLOC
+fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+(uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+#endif
 return ptr;
 }
 
-static void
+static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
 scoped_spin_lock lock(g_cuda_pool_lock);
-int id;
-CUDA_CHECK(cudaGetDevice(&id));
 
 for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-
+ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
 if (b.ptr == nullptr) {
 b.ptr = ptr;
 b.size = size;
@@ -6612,9 +6653,149 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 }
 }
 fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+ggml_cuda_set_device(device);
 CUDA_CHECK(cudaFree(ptr));
+g_cuda_pool_size[device] -= size;
 }
 
+#if !defined(GGML_USE_HIPBLAS)
+// pool with virtual memory
+static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
+static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
+
+static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
+scoped_spin_lock lock(g_cuda_pool_lock);
+
+// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
+const size_t alignment = 128;
+size = alignment * ((size + alignment - 1) / alignment);
+
+size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device];
+
+if (size > avail) {
+// round up to the next multiple of the granularity
+size_t reserve_size = size - avail;
+const size_t granularity = g_device_caps[device].vmm_granularity;
+reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
+
+GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
+
+// allocate more physical memory
+CUmemAllocationProp prop = {};
+prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+prop.location.id = device;
+CUmemGenericAllocationHandle handle;
+CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+
+// reserve virtual address space (if not already reserved)
+if (g_cuda_pool_addr[device] == 0) {
+CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+}
+
+// map at the end of the pool
+CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0));
+
+// the memory allocation handle is no longer needed after mapping
+CU_CHECK(cuMemRelease(handle));
+
+// set access
+CUmemAccessDesc access = {};
+access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+access.location.id = device;
+access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1));
+
+// add to the pool
+g_cuda_pool_size[device] += reserve_size;
+
+//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+// id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+// (unsigned long long) (reserve_size/1024/1024));
+}
+
+GGML_ASSERT(g_cuda_pool_addr[device] != 0);
+
+void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]);
+*actual_size = size;
+g_cuda_pool_used[device] += size;
+
+#ifdef DEBUG_CUDA_MALLOC
+printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
+#endif
+
+return ptr;
+}
+
+static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
+scoped_spin_lock lock(g_cuda_pool_lock);
+
+#ifdef DEBUG_CUDA_MALLOC
+printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
+#endif
+
+g_cuda_pool_used[device] -= size;
+
+// all deallocations must be in reverse order of the allocations
+GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]));
+}
+
+static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
+if (g_device_caps[device].vmm) {
+return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
+} else {
+return ggml_cuda_pool_malloc_leg(device, size, actual_size);
+}
+}
+
+static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
+if (g_device_caps[device].vmm) {
+ggml_cuda_pool_free_vmm(device, ptr, size);
+} else {
+ggml_cuda_pool_free_leg(device, ptr, size);
+}
+}
+#else
+#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
+#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
+#endif // !defined(GGML_USE_HIPBLAS)
+
+template<typename T>
+struct cuda_pool_alloc {
+int device = -1;
+T * ptr = nullptr;
+size_t actual_size = 0;
+
+// size is in number of elements
+T * alloc(size_t size) {
+GGML_ASSERT(ptr == nullptr);
+CUDA_CHECK(cudaGetDevice(&device));
+ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size);
+return ptr;
+}
+
+cuda_pool_alloc(size_t size) {
+alloc(size);
+}
+
+~cuda_pool_alloc() {
+if (ptr != nullptr) {
+ggml_cuda_pool_free(device, ptr, actual_size);
+}
+}
+
+T * get() {
+return ptr;
+}
+
+cuda_pool_alloc() = default;
+cuda_pool_alloc(const cuda_pool_alloc &) = delete;
+cuda_pool_alloc(cuda_pool_alloc &&) = delete;
+cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
+cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
+};
+
 static bool g_cublas_loaded = false;
 
 bool ggml_cublas_loaded(void) {
@@ -6653,16 +6834,33 @@ void ggml_init_cublas() {
 #endif
 fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
 for (int id = 0; id < g_device_count; ++id) {
+int device_vmm = 0;
+
+#if !defined(GGML_USE_HIPBLAS)
+CUdevice device;
+CU_CHECK(cuDeviceGet(&device, id));
+CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
+
+if (device_vmm) {
+CUmemAllocationProp alloc_prop = {};
+alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+alloc_prop.location.id = id;
+CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+}
+#endif // !defined(GGML_USE_HIPBLAS)
+g_device_caps[id].vmm = !!device_vmm;
+
 cudaDeviceProp prop;
 CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
 g_tensor_split[id] = total_vram;
 total_vram += prop.totalGlobalMem;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
+g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
-
+g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }
 for (int id = 0; id < g_device_count; ++id) {
@@ -6670,7 +6868,7 @@ void ggml_init_cublas() {
 }
 
 for (int id = 0; id < g_device_count; ++id) {
-
+ggml_cuda_set_device(id);
 
 // create cuda streams
 for (int is = 0; is < MAX_STREAMS; ++is) {
@@ -6722,8 +6920,7 @@ void * ggml_cuda_host_malloc(size_t size) {
 void * ptr = nullptr;
 cudaError_t err = cudaMallocHost((void **) &ptr, size);
 if (err != cudaSuccess) {
-//
-// This can fixed the OOM error in WSL.
+// clear the error
 cudaGetLastError();
 fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
 size/1024.0/1024.0, cudaGetErrorString(err));
@@ -6786,7 +6983,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
 
 static void ggml_cuda_op_get_rows(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_d, const float * src1_d, float * dst_d,
+const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {
 
 GGML_ASSERT(src1->type == GGML_TYPE_I32);
 GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -6828,9 +7025,9 @@ static void ggml_cuda_op_get_rows(
 }
 
 template<class op>
-
+static void ggml_cuda_op_bin_bcast(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
@@ -6849,7 +7046,7 @@ inline void ggml_cuda_op_bin_bcast(
 
 static void ggml_cuda_op_repeat(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_d, const float * src1_d, float * dst_d,
+const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {
 
 ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
@@ -6857,16 +7054,16 @@ static void ggml_cuda_op_repeat(
 (void) src1_d;
 }
 
-
+static void ggml_cuda_op_add(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
-
+static void ggml_cuda_op_acc(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -6883,23 +7080,23 @@ inline void ggml_cuda_op_acc(
 (void) dst;
 }
 
-
+static void ggml_cuda_op_mul(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
-
+static void ggml_cuda_op_div(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
-
+static void ggml_cuda_op_gelu(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6911,9 +7108,9 @@ inline void ggml_cuda_op_gelu(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_silu(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6925,9 +7122,9 @@ inline void ggml_cuda_op_silu(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_gelu_quick(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6939,9 +7136,9 @@ inline void ggml_cuda_op_gelu_quick(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_tanh(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6953,9 +7150,9 @@ inline void ggml_cuda_op_tanh(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_relu(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6967,9 +7164,9 @@ inline void ggml_cuda_op_relu(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_leaky_relu(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6984,9 +7181,9 @@ inline void ggml_cuda_op_leaky_relu(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_sqr(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6998,9 +7195,9 @@ inline void ggml_cuda_op_sqr(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_norm(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7018,10 +7215,9 @@ inline void ggml_cuda_op_norm(
 (void) src1_dd;
 }
 
-
-inline void ggml_cuda_op_group_norm(
+static void ggml_cuda_op_group_norm(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7035,9 +7231,9 @@ inline void ggml_cuda_op_group_norm(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_concat(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7051,9 +7247,9 @@ inline void ggml_cuda_op_concat(
 (void) dst;
 }
 
-
+static void ggml_cuda_op_upscale(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -7068,9 +7264,9 @@ inline void ggml_cuda_op_upscale(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_pad(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -7085,9 +7281,9 @@ inline void ggml_cuda_op_pad(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_rms_norm(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7105,10 +7301,10 @@ inline void ggml_cuda_op_rms_norm(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_mul_mat_q(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-const int64_t src1_padded_row_size,
+const int64_t src1_padded_row_size, cudaStream_t stream) {
 
 const int64_t ne00 = src0->ne[0];
 
@@ -7170,13 +7366,13 @@ inline void ggml_cuda_op_mul_mat_q(
 static int64_t get_row_rounding(ggml_type type) {
 int64_t min_compute_capability = INT_MAX;
 int64_t max_compute_capability = INT_MIN;
-for (
+for (int id = 0; id < g_device_count; ++id) {
 if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-if (min_compute_capability >
-min_compute_capability =
+if (min_compute_capability > g_device_caps[id].cc) {
+min_compute_capability = g_device_caps[id].cc;
 }
-if (max_compute_capability <
-max_compute_capability =
+if (max_compute_capability < g_device_caps[id].cc) {
+max_compute_capability = g_device_caps[id].cc;
 }
 }
 }
@@ -7228,10 +7424,10 @@ static int64_t get_row_rounding(ggml_type type) {
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }
 
-
+static void ggml_cuda_op_mul_mat_vec_q(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-const int64_t src1_padded_row_size,
+const int64_t src1_padded_row_size, cudaStream_t stream) {
 
 GGML_ASSERT(ggml_nrows(src1) == 1);
 
@@ -7281,18 +7477,18 @@ inline void ggml_cuda_op_mul_mat_vec_q(
 (void) src1_padded_row_size;
 }
 
-
+static void ggml_cuda_op_dequantize_mul_mat_vec(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-const int64_t src1_padded_row_size,
+const int64_t src1_padded_row_size, cudaStream_t stream) {
 
 const int64_t ne00 = src0->ne[0];
 const int64_t row_diff = row_high - row_low;
 
 // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
-
-
+cuda_pool_alloc<half> src1_dfloat_a;
+half * src1_dfloat = nullptr; // dfloat == half
 
 bool src1_convert_f16 =
 src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
@@ -7300,7 +7496,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
 src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
 if (src1_convert_f16) {
-src1_dfloat = (
+src1_dfloat = src1_dfloat_a.alloc(ne00);
 ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
 ne00, 1, sizeof(float), 0, 0,
 ne00, 1, sizeof(half), 0, 0, stream);
@@ -7348,12 +7544,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
 break;
 }
 
-#ifdef GGML_CUDA_F16
-if (src1_convert_f16) {
-ggml_cuda_pool_free(src1_dfloat, ash);
-}
-#endif // GGML_CUDA_F16
-
 (void) src1;
 (void) dst;
 (void) src1_ddq_i;
@@ -7361,10 +7551,10 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
 (void) src1_padded_row_size;
 }
 
-
+static void ggml_cuda_op_mul_mat_cublas(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-const int64_t src1_padded_row_size,
+const int64_t src1_padded_row_size, cudaStream_t stream) {
 
 GGML_ASSERT(src0_dd_i != nullptr);
 GGML_ASSERT(src1_ddf_i != nullptr);
@@ -7384,33 +7574,30 @@ inline void ggml_cuda_op_mul_mat_cublas(
 // ldc == nrows of the matrix that cuBLAS writes into
 int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
 
-const int compute_capability =
+const int compute_capability = g_device_caps[id].cc;
 
 if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
 // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-half
-size_t src0_as = 0;
+cuda_pool_alloc<half> src0_as_f16;
 if (src0->type != GGML_TYPE_F16) {
 const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
 GGML_ASSERT(to_fp16_cuda != nullptr);
 size_t ne = row_diff*ne00;
-src0_as_f16
-to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
+src0_as_f16.alloc(ne);
+to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
 }
-const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
+const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
 
-half
-size_t src1_as = 0;
+cuda_pool_alloc<half> src1_as_f16;
 if (src1->type != GGML_TYPE_F16) {
 const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
 GGML_ASSERT(to_fp16_cuda != nullptr);
 size_t ne = src1_ncols*ne10;
-src1_as_f16
-to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+src1_as_f16.alloc(ne);
+to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
 }
-const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
-
-half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
+cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
 
 const half alpha_f16 = 1.0f;
 const half beta_f16 = 0.0f;
@@ -7419,36 +7606,25 @@ inline void ggml_cuda_op_mul_mat_cublas(
 CUBLAS_CHECK(
 cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
 row_diff, src1_ncols, ne10,
-&alpha_f16, src0_ptr,
-src1_ptr,
-&beta_f16, dst_f16, CUDA_R_16F, ldc,
+&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
+src1_ptr, CUDA_R_16F, ne10,
+&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
 CUBLAS_COMPUTE_16F,
 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
-
-ggml_cuda_pool_free(dst_f16, dst_as);
-
-if (src0_as != 0) {
-ggml_cuda_pool_free(src0_as_f16, src0_as);
-}
-
-if (src1_as != 0) {
-ggml_cuda_pool_free(src1_as_f16, src1_as);
-}
+to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
 }
 else {
-float
-size_t src0_as = 0;
+cuda_pool_alloc<float> src0_ddq_as_f32;
 
 if (src0->type != GGML_TYPE_F32) {
 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
 GGML_ASSERT(to_fp32_cuda != nullptr);
-src0_ddq_as_f32
-to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+src0_ddq_as_f32.alloc(row_diff*ne00);
+to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
 }
-const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
 
 const float alpha = 1.0f;
 const float beta = 0.0f;
@@ -7460,10 +7636,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
 &alpha, src0_ddf_i, ne00,
 src1_ddf_i, ne10,
 &beta, dst_dd_i, ldc));
-
-if (src0_as != 0) {
-ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
-}
 }
 
 (void) dst;
@@ -7471,9 +7643,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
 (void) src1_padded_row_size;
 }
 
-
+static void ggml_cuda_op_rope(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
 GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -7551,9 +7723,9 @@ inline void ggml_cuda_op_rope(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_alibi(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7582,9 +7754,9 @@ inline void ggml_cuda_op_alibi(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_im2col(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F16);
 GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7617,10 +7789,9 @@ inline void ggml_cuda_op_im2col(
 (void) src0_dd;
 }
 
-
-inline void ggml_cuda_op_sum_rows(
+static void ggml_cuda_op_sum_rows(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7635,9 +7806,9 @@ inline void ggml_cuda_op_sum_rows(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_argsort(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_I32);
@@ -7654,9 +7825,9 @@ inline void ggml_cuda_op_argsort(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_diag_mask_inf(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7674,9 +7845,9 @@ inline void ggml_cuda_op_diag_mask_inf(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_soft_max(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7695,14 +7866,15 @@ inline void ggml_cuda_op_soft_max(
 (void) dst;
 }
 
-
+static void ggml_cuda_op_scale(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
+float scale;
+memcpy(&scale, dst->op_params, sizeof(float));
 
 scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
 CUDA_CHECK(cudaGetLastError());
@@ -7712,9 +7884,9 @@ inline void ggml_cuda_op_scale(
 (void) src1_dd;
 }
 
-
+static void ggml_cuda_op_clamp(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-const float * src0_dd, const float * src1_dd, float * dst_dd,
+const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -7754,18 +7926,17 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     float * src1_ddf = nullptr;
     float * dst_ddf = nullptr;

-
-
-
-    size_t dst_asf = 0;
+    cuda_pool_alloc<float> src0_f;
+    cuda_pool_alloc<float> src1_f;
+    cuda_pool_alloc<float> dst_f;

     ggml_cuda_set_device(g_main_device);
-
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

     if (src0_on_device) {
         src0_ddf = (float *) src0_extra->data_device[g_main_device];
     } else {
-        src0_ddf = (
+        src0_ddf = src0_f.alloc(ggml_nelements(src0));
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }

@@ -7773,14 +7944,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
-            src1_ddf = (
+            src1_ddf = src1_f.alloc(ggml_nelements(src1));
             CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
         dst_ddf = (float *) dst_extra->data_device[g_main_device];
     } else {
-        dst_ddf = (
+        dst_ddf = dst_f.alloc(ggml_nelements(dst));
     }

     // do the computation
@@ -7792,16 +7963,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }

-    if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
-    }
-    if (src1_asf > 0) {
-        ggml_cuda_pool_free(src1_ddf, src1_asf);
-    }
-    if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
-    }
-
     if (dst->backend == GGML_BACKEND_CPU) {
         CUDA_CHECK(cudaDeviceSynchronize());
     }
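In `ggml_cuda_op_flatten` the manual `ggml_cuda_pool_malloc`/`ggml_cuda_pool_free` pairs and the `*_asf` size bookkeeping are replaced by `cuda_pool_alloc<T>` objects whose destructors return the buffers to the pool. A rough, self-contained sketch of such an RAII wrapper, using `cudaMalloc`/`cudaFree` as stand-ins for the real pool and with error handling omitted; this is an illustration, not the vendored implementation:

#include <cstddef>
#include <cuda_runtime.h>

// Stand-ins for a real memory pool: here they simply forward to the CUDA allocator.
static void * pool_malloc(size_t size, size_t * actual_size) {
    void * ptr = nullptr;
    cudaMalloc(&ptr, size);
    *actual_size = size;
    return ptr;
}
static void pool_free(void * ptr, size_t /*actual_size*/) {
    cudaFree(ptr);
}

// RAII handle: the buffer is released automatically when the object goes out of scope.
template <typename T>
struct pool_alloc {
    T *    ptr  = nullptr;
    size_t size = 0;

    pool_alloc() = default;
    explicit pool_alloc(size_t n) { alloc(n); }
    ~pool_alloc() { if (ptr != nullptr) pool_free(ptr, size); }

    pool_alloc(const pool_alloc &) = delete;
    pool_alloc & operator=(const pool_alloc &) = delete;

    T * alloc(size_t n) {
        ptr = (T *) pool_malloc(n*sizeof(T), &size);
        return ptr;
    }
    T * get() { return ptr; }
};

// usage, mirroring the diff:
//   pool_alloc<float> dst_f;                    // declared empty, no allocation yet
//   float * dst_ddf = dst_f.alloc(n_elements);  // allocate on demand
//   ...                                         // freed automatically at scope exit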
@@ -7818,12 +7979,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {

 #ifdef NDEBUG
     for (int id = 0; id < g_device_count; ++id) {
-
+        ggml_cuda_set_device(id);
         CUDA_CHECK(cudaDeviceSynchronize());
     }

     for (int id = 0; id < g_device_count; ++id) {
-
+        ggml_cuda_set_device(id);

         for (int id_other = 0; id_other < g_device_count; ++id_other) {
             if (id == id_other) {
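`ggml_cuda_set_peer_access` now switches devices through the plain wrapper instead of a `CUDA_CHECK`-wrapped call. The rest of the function (not fully visible in this hunk) toggles direct peer access between device pairs; a generic sketch of the CUDA runtime calls such a routine builds on, not the file's exact code:

#include <cuda_runtime.h>

// Enable peer-to-peer access from device `id` to device `id_other`
// when the hardware supports it; tolerate "already enabled".
static void enable_peer_access(int id, int id_other) {
    int can_access = 0;
    cudaDeviceCanAccessPeer(&can_access, id, id_other);
    if (can_access) {
        cudaSetDevice(id);
        cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0); // flags must be 0
        if (err == cudaErrorPeerAccessAlreadyEnabled) {
            cudaGetLastError(); // clear the sticky error state
        }
    }
}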
@@ -7857,7 +8018,6 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7900,27 +8060,29 @@ static void ggml_cuda_op_mul_mat(
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));

-
-
-
-
-
+    struct dev_data {
+        cuda_pool_alloc<char> src0_dd_alloc;
+        cuda_pool_alloc<float> src1_ddf_alloc;
+        cuda_pool_alloc<char> src1_ddq_alloc;
+        cuda_pool_alloc<float> dst_dd_alloc;
+
+        char * src0_dd = nullptr;
+        float * src1_ddf = nullptr; // float
+        char * src1_ddq = nullptr; // q8_1
+        float * dst_dd = nullptr;

-
-
-
-    size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
+        int64_t row_low;
+        int64_t row_high;
+    };

-
-    int64_t row_high[GGML_CUDA_MAX_DEVICES];
+    dev_data dev[GGML_CUDA_MAX_DEVICES];

     int used_devices = 0;

-    for (
+    for (int id = 0; id < g_device_count; ++id) {
         // by default, use all rows
-
-
+        dev[id].row_low = 0;
+        dev[id].row_high = ne01;

         // for multi GPU, get the row boundaries from tensor split
         // and round to mul_mat_q tile sizes
@@ -7928,19 +8090,23 @@ static void ggml_cuda_op_mul_mat(
             const int64_t rounding = get_row_rounding(src0->type);

             if (id != 0) {
-
-
+                dev[id].row_low = ne01*g_tensor_split[id];
+                if (dev[id].row_low < ne01) {
+                    dev[id].row_low -= dev[id].row_low % rounding;
+                }
             }

             if (id != g_device_count - 1) {
-
-
+                dev[id].row_high = ne01*g_tensor_split[id + 1];
+                if (dev[id].row_high < ne01) {
+                    dev[id].row_high -= dev[id].row_high % rounding;
+                }
             }
         }
     }

-    for (
-        if ((!split && id != g_main_device) ||
+    for (int id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
             continue;
         }

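The per-device row boundaries now live in `dev[id]`, and the rounding to `get_row_rounding(...)` tile sizes is only applied while the boundary is still inside the matrix (`< ne01`). A standalone restatement of that boundary computation, with hypothetical `tensor_split` and `rounding` parameters standing in for the globals used above:

#include <cstdint>

// Compute the [row_low, row_high) slice of a ne01-row matrix assigned to
// device `id`, given cumulative split fractions tensor_split[0..n_devices-1]
// (tensor_split[0] == 0) and a tile rounding requirement.
static void device_row_range(int id, int n_devices, int64_t ne01,
                             const float * tensor_split, int64_t rounding,
                             int64_t * row_low, int64_t * row_high) {
    *row_low  = 0;
    *row_high = ne01;

    if (id != 0) {
        *row_low = ne01*tensor_split[id];
        if (*row_low < ne01) {
            *row_low -= *row_low % rounding; // round down to a tile boundary
        }
    }
    if (id != n_devices - 1) {
        *row_high = ne01*tensor_split[id + 1];
        if (*row_high < ne01) {
            *row_high -= *row_high % rounding;
        }
    }
}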
@@ -7950,42 +8116,41 @@ static void ggml_cuda_op_mul_mat(
         const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;

         ggml_cuda_set_device(id);
-
+        cudaStream_t stream = g_cudaStreams[id][0];

         if (src0_on_device && src0_is_contiguous) {
-
+            dev[id].src0_dd = (char *) src0_extra->data_device[id];
         } else {
-
-            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
+            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
         }

         if (src1_on_device && src1_is_contiguous) {
-
+            dev[id].src1_ddf = (float *) src1_extra->data_device[id];
         } else {
-
+            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1));
         }

         if (convert_src1_to_q8_1) {
-
+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

             if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(
+                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }

         if (dst_on_device) {
-
+            dev[id].dst_dd = (float *) dst_extra->data_device[id];
         } else {
-            const size_t size_dst_ddf = split ? (
-
+            const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
+            dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf);
         }
     }

     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
     if (split && used_devices > 1) {
-
+        ggml_cuda_set_device(g_main_device);
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }

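When `convert_src1_to_q8_1` is set, each device sizes its quantized staging buffer as `nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs`, that is rows times padded columns scaled by the quantized type's bytes-per-block over elements-per-block. A toy version of that arithmetic; the block parameters in the usage comment are hypothetical, the real `q8_1_ts`/`q8_1_bs` values come from ggml's type traits:

#include <cstddef>
#include <cstdint>

// Bytes needed to hold nrows x padded_cols elements in a block-quantized
// format that packs block_elems elements into block_bytes bytes.
// padded_cols is assumed to already be a multiple of block_elems.
static size_t quantized_buffer_bytes(int64_t nrows, int64_t padded_cols,
                                     size_t block_bytes, int64_t block_elems) {
    return (size_t) nrows * padded_cols * block_bytes / block_elems;
}

// e.g. a hypothetical 32-element block stored in 36 bytes:
//   quantized_buffer_bytes(n_rows, padded_cols, 36, 32);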
@@ -7994,17 +8159,17 @@ static void ggml_cuda_op_mul_mat(
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

-        for (
-            if ((!split && id != g_main_device) ||
+        for (int id = 0; id < g_device_count; ++id) {
+            if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
                 continue;
             }

             const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
             const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
-            const int64_t row_diff =
+            const int64_t row_diff = dev[id].row_high - dev[id].row_low;

             ggml_cuda_set_device(id);
-
+            cudaStream_t stream = g_cudaStreams[id][is];

             // wait for main GPU data if necessary
             if (split && (id != g_main_device || is != 0)) {
@@ -8018,34 +8183,34 @@ static void ggml_cuda_op_mul_mat(
             const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;

             // for split tensors the data begins at i0 == i0_offset_low
-            char * src0_dd_i =
-            float * src1_ddf_i =
-            char * src1_ddq_i =
-            float * dst_dd_i =
+            char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+            float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
+            char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
+            float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);

             // the main device memory buffer can be on VRAM scratch, with space for all partial results
             // in that case an offset on dst_ddf_i is needed
             if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
-                dst_dd_i +=
+                dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
             }

             // copy src0, src1 to device if necessary
             if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                 if (id != g_main_device) {
                     if (convert_src1_to_q8_1) {
-                        char * src1_ddq_i_source =
-                        CUDA_CHECK(
-
+                        char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
+                        CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
+                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
                     } else {
                         float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                         src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-                        CUDA_CHECK(
-
+                        CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
+                            src1_ncols*ne10*sizeof(float), stream));
                     }
                 }
             } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
                 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
-
+                    src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
             } else {
                 GGML_ASSERT(false);
             }
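Cross-device copies of the quantized and float `src1` slices now go through `cudaMemcpyPeerAsync`, which names the source and destination devices explicitly (this is also why the HIP compatibility table at the top of the file gains a `cudaMemcpyPeerAsync` mapping). A generic usage sketch of that runtime call:

#include <cuda_runtime.h>

// Asynchronously copy `count` bytes from `src` on device `src_dev`
// to `dst` on device `dst_dev`, ordered on `stream`.
static cudaError_t copy_between_devices(void * dst, int dst_dev,
                                        const void * src, int src_dev,
                                        size_t count, cudaStream_t stream) {
    return cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, count, stream);
}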
@@ -8056,12 +8221,12 @@ static void ggml_cuda_op_mul_mat(
             }

             if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-                CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor,
+                CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
             }

             // do the computation
             op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-
+                dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
             CUDA_CHECK(cudaGetLastError());

             // copy dst to host or other device if necessary
@@ -8085,9 +8250,25 @@ static void ggml_cuda_op_mul_mat(
                 // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                dhf_dst_i += src1_col_0*ne0 +
-
-
+                dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
+#if !defined(GGML_USE_HIPBLAS)
+                if (kind == cudaMemcpyDeviceToDevice) {
+                    // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+                    cudaMemcpy3DPeerParms p = {};
+                    p.dstDevice = g_main_device;
+                    p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
+                    p.srcDevice = id;
+                    p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
+                    p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
+                    CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
+                } else
+#endif
+                {
+                    CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
+                        dst_dd_i, row_diff*sizeof(float),
+                        row_diff*sizeof(float), src1_ncols,
+                        kind, stream));
+                }
             } else {
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
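For split results the partial `dst` tile is written back to the main device as a strided 2D region; between two devices the new code avoids `cudaMemcpy2DAsync` (the added comment notes it may fail between VMM pools of different devices) and uses `cudaMemcpy3DPeerAsync` instead. A self-contained sketch of that copy pattern for a tile of `rows` floats per column across `cols` columns, with different pitches on each side:

#include <cuda_runtime.h>

// Copy a cols x rows tile of floats from device `src_dev` (tightly packed,
// pitch = rows floats) to device `dst_dev` (pitch = dst_stride floats).
static cudaError_t copy_tile_peer(float * dst, int dst_dev, size_t dst_stride,
                                  float * src, int src_dev,
                                  size_t rows, size_t cols, cudaStream_t stream) {
    cudaMemcpy3DPeerParms p = {};
    p.dstDevice = dst_dev;
    p.dstPtr    = make_cudaPitchedPtr(dst, dst_stride*sizeof(float), rows, cols);
    p.srcDevice = src_dev;
    p.srcPtr    = make_cudaPitchedPtr(src, rows*sizeof(float), rows, cols);
    p.extent    = make_cudaExtent(rows*sizeof(float), cols, 1);
    return cudaMemcpy3DPeerAsync(&p, stream);
}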
@@ -8104,35 +8285,14 @@ static void ggml_cuda_op_mul_mat(
         }
     }

-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
-            continue;
-        }
-        CUDA_CHECK(ggml_cuda_set_device(id));
-
-        // free buffers again when done
-        if (src0_as[id] > 0) {
-            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
-        }
-        if (src1_asq[id] > 0) {
-            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
-        }
-        if (dst_as[id] > 0) {
-            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
-        }
-    }
-
     // main device waits for all other devices to be finished
     if (split && g_device_count > 1) {
         int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
         is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;

-
-        for (
-            if (
+        ggml_cuda_set_device(g_main_device);
+        for (int id = 0; id < g_device_count; ++id) {
+            if (dev[id].row_low == dev[id].row_high) {
                 continue;
             }
             for (int64_t is = 0; is < is_max; ++is) {
@@ -8142,7 +8302,7 @@ static void ggml_cuda_op_mul_mat(
     }

     if (dst->backend == GGML_BACKEND_CPU) {
-
+        ggml_cuda_set_device(g_main_device);
         CUDA_CHECK(cudaDeviceSynchronize());
     }
 }
@@ -8252,7 +8412,7 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens

     const int64_t ne12 = src1->ne[2];

-
+    ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -8284,7 +8444,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor

     const int64_t ne12 = src1->ne[2];

-
+    ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -8355,7 +8515,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne = ggml_nelements(dst);

-
+    ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

     CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
@@ -8374,14 +8534,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
     GGML_ASSERT(to_fp16_cuda != nullptr);

-
-
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
-
-    size_t dst_as = 0;
+    cuda_pool_alloc<half> src1_as_f16(ne1);
+    to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);

-    half
-    char * dst_t
+    cuda_pool_alloc<half> dst_f16;
+    char * dst_t;

     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t cu_data_type = CUDA_R_16F;
@@ -8400,8 +8557,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const void * beta = &beta_f16;

     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-
-        dst_t = (char *) dst_f16;
+        dst_t = (char *) dst_f16.alloc(ne);

         nbd2 /= sizeof(float) / sizeof(half);
         nbd3 /= sizeof(float) / sizeof(half);
@@ -8448,9 +8604,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const char *) src0_as_f16,
-                       (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
-                beta, ( char *) dst_t,
+                alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+                       (const char *) src1_as_f16.get(), CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -8458,19 +8614,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         // use cublasGemmBatchedEx
         const int ne23 = ne12*ne13;

-        const void
-
-
-        size_t ptrs_src_s = 0;
-        size_t ptrs_dst_s = 0;
-
-        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-        ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
+        cuda_pool_alloc< void *> ptrs_dst(1*ne23);

         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-            src0_as_f16, src1_as_f16, dst_t,
-            ptrs_src, ptrs_dst,
+            src0_as_f16, src1_as_f16.get(), dst_t,
+            ptrs_src.get(), ptrs_dst.get(),
             ne12, ne13,
             ne23,
             nb02, nb03,
@@ -8482,30 +8632,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                       (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
-                beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
+                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-        if (ptrs_src_s != 0) {
-            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-        }
-        if (ptrs_dst_s != 0) {
-            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-        }
     }
 #endif

     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-        ggml_cuda_pool_free(dst_f16, dst_as);
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
     }
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
 }

 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8517,9 +8656,9 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;

     int64_t min_compute_capability = INT_MAX;
-    for (
-        if (min_compute_capability >
-            min_compute_capability =
+    for (int id = 0; id < g_device_count; ++id) {
+        if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_device_caps[id].cc;
         }
     }

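Kernel selection in `ggml_cuda_mul_mat` now takes the minimum compute capability only over devices that actually receive a non-empty share of the tensor split. A compact restatement of that selection, with hypothetical `cc` and `split` arrays in place of the globals:

#include <climits>

// Minimum compute capability over devices whose split interval
// [split[id], next boundary) is non-empty; split is cumulative and ends at 1.0.
static int min_cc_of_used_devices(const int * cc, const float * split, int n_devices) {
    int min_cc = INT_MAX;
    for (int id = 0; id < n_devices; ++id) {
        const float next = id + 1 < n_devices ? split[id + 1] : 1.0f;
        if (split[id] < next && cc[id] < min_cc) {
            min_cc = cc[id];
        }
    }
    return min_cc;
}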
@@ -8660,7 +8799,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne = ggml_nelements(dst);

-
+    ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

     CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
@@ -8778,7 +8917,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

     std::vector<char> ids_host(ggml_nbytes(ids));

-
+    cudaStream_t stream = g_cudaStreams[g_main_device][0];

     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
@@ -8832,17 +8971,16 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
             ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
         }
     } else {
-
-        char
-        char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
+        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
+        cuda_pool_alloc<char> dst_contiguous(sizeof(float)*ggml_nelements(dst));

-        src1_row_extra.data_device[g_main_device] = src1_contiguous;
-        dst_row_extra.data_device[g_main_device] = dst_contiguous;
+        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
+        dst_row_extra.data_device[g_main_device] = dst_contiguous.get();

         const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
             cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
         const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
-
+            cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;

         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
             const struct ggml_tensor * src0_row = dst->src[row_id + 2];
@@ -8857,7 +8995,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

                 GGML_ASSERT(row_id >= 0 && row_id < n_as);

-                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
                     nb11, src1_kind, stream));
                 num_src1_rows++;
             }
@@ -8889,14 +9027,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

                 GGML_ASSERT(row_id >= 0 && row_id < n_as);

-                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
                     nb1, dst_kind, stream));
                 num_src1_rows++;
             }
         }
-
-        ggml_cuda_pool_free(src1_contiguous, as_src1);
-        ggml_cuda_pool_free(dst_contiguous, as_dst);
     }

     if (dst->backend == GGML_BACKEND_CPU) {
@@ -8938,7 +9073,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];

-
+    ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

     const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -9028,7 +9163,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));

-    for (
+    for (int id = 0; id < g_device_count; ++id) {
         if (backend == GGML_BACKEND_GPU && id != g_main_device) {
             continue;
         }
@@ -9099,15 +9234,14 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

-    for (
+    for (int id = 0; id < g_device_count; ++id) {
+        ggml_cuda_set_device(id);
         if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(ggml_cuda_set_device(id));
             CUDA_CHECK(cudaFree(extra->data_device[id]));
         }

         for (int64_t is = 0; is < MAX_STREAMS; ++is) {
             if (extra->events[id][is] != nullptr) {
-                CUDA_CHECK(ggml_cuda_set_device(id));
                 CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
             }
         }
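In `ggml_cuda_free_data` the device is now made current once per loop iteration instead of separately before each `cudaFree` and `cudaEventDestroy`. The general shape of that per-device cleanup pattern, with made-up array bounds and no relation to the file's actual globals:

#include <cuda_runtime.h>

#define MAX_DEVICES 16
#define MAX_EVENTS   8

// Free per-device buffers and events, making each device current once
// before releasing the resources that were created on it.
static void free_per_device(void * data[MAX_DEVICES],
                            cudaEvent_t events[MAX_DEVICES][MAX_EVENTS],
                            int n_devices) {
    for (int id = 0; id < n_devices; ++id) {
        cudaSetDevice(id);
        if (data[id] != nullptr) {
            cudaFree(data[id]);
        }
        for (int is = 0; is < MAX_EVENTS; ++is) {
            if (events[id][is] != nullptr) {
                cudaEventDestroy(events[id][is]);
            }
        }
    }
}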
@@ -9161,7 +9295,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         force_inplace;
     const size_t size = ggml_nbytes(tensor);

-
+    ggml_cuda_set_device(g_main_device);
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
@@ -9238,7 +9372,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
     GGML_ASSERT(ggml_is_contiguous(tensor));

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
 }

@@ -9662,12 +9796,16 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 // host buffer type

 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-
+    ggml_cuda_host_free(buffer->context);
 }

 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr;
-
+    void * ptr = ggml_cuda_host_malloc(size);
+
+    if (ptr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }

     // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
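The host buffer type now allocates through `ggml_cuda_host_malloc`/`ggml_cuda_host_free` and, when pinned allocation fails, falls back to a plain CPU buffer instead of failing outright. The underlying CUDA pattern is pinned host allocation with a graceful fallback, roughly:

#include <cstdlib>
#include <cuda_runtime.h>

// Try to get page-locked (pinned) host memory; fall back to ordinary malloc
// when cudaMallocHost fails (no device, pinned memory exhausted, ...).
// The caller must remember which path was taken so it can free correctly
// (cudaFreeHost vs std::free).
static void * host_alloc(size_t size, bool * pinned) {
    void * ptr = nullptr;
    if (cudaMallocHost(&ptr, size) == cudaSuccess) {
        *pinned = true;
        return ptr;
    }
    cudaGetLastError(); // clear the error state
    *pinned = false;
    return std::malloc(size);
}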