llama_cpp 0.3.0 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +195 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +499 -118
- data/ext/llama_cpp/src/ggml-cuda.h +1 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +357 -176
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +230 -261
- data/ext/llama_cpp/src/llama.h +31 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +21 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml-cuda.cu:

@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
 
 #define QK4_0 32
 #define QR4_0 2
+#define QI4_0 4
 typedef struct {
     half    d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
+#define QI4_1 4
 typedef struct {
     half    d;              // delta
     half    m;              // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
 typedef struct {
     half    d;              // delta
     uint8_t qh[4];          // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half    d;              // delta
     half    m;              // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 8
 typedef struct {
     half    d;              // delta
     int8_t  qs[QK8_0];      // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
 
+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 8
+typedef struct {
+    half    d;              // delta
+    half    s;              // unquantized sum
+    int8_t  qs[QK8_0];      // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+
 //================================= k-quants
 
 #ifdef GGML_QKK_64
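The new block_q8_1 format keeps, next to the int8 quants, the per-block scale d and the unquantized sum s; the q4_1/q5_1 dot products further down use s to fold in their per-block minimum m without dequantizing. A rough CPU-side sketch of the same quantization, assuming the block_q8_1/QK8_1 definitions above and CUDA's half type (hypothetical helper, not part of the patch):

// Hypothetical host-side reference for one 32-value block of q8_1.
// d = amax / 127 so every value fits in an int8; s = sum of the original
// floats, later consumed as m*s by the *_1 vec_dot kernels.
static void quantize_block_q8_1_ref(const float * x, block_q8_1 * y) {
    float amax = 0.0f;
    float sum  = 0.0f;
    for (int j = 0; j < QK8_1; ++j) {
        amax = fmaxf(amax, fabsf(x[j]));
        sum += x[j];
    }
    const float d = amax / 127;
    for (int j = 0; j < QK8_1; ++j) {
        y->qs[j] = amax == 0.0f ? 0 : (int8_t) roundf(x[j] / d);
    }
    y->d = d;   // implicit float -> half conversion, as in the kernel below
    y->s = sum;
}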
@@ -198,14 +216,15 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef GGML_CUDA_DMMV_Y
-#define GGML_CUDA_DMMV_Y 1
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
 #endif
 
 #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +233,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -223,6 +247,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }
 
+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __hadd(x[i], __float2half(y[i]));
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -256,7 +289,6 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -700,7 +732,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -805,7 +836,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -909,7 +939,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1014,7 +1043,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1125,7 +1153,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1144,6 +1171,41 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
+static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i / QK8_0; // block index
+    const int iqs = i % QK8_0; // quant index
+
+    const float xi = x[i];
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    y[ib].d = d;
+    y[ib].s = sum;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
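quantize_q8_1 relies on the usual warp-wide butterfly reduction: each of the 32 lanes handles one value of a block and, after five __shfl_xor_sync exchanges, every lane holds the block-wide max and sum, so no shared memory or __syncthreads() is required. That is also why the redundant __syncthreads() calls before the existing warp reductions could be dropped above. A stand-alone sketch of the same pattern (illustrative, not taken from the patch):

// Minimal sketch of a 32-lane butterfly reduction, mirroring the pattern
// used in quantize_q8_1 and the dequantize_mul_mat_vec kernels.
__device__ float warp_reduce_sum_sketch(float v) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        // after the last iteration every lane of the warp holds the full sum
        v += __shfl_xor_sync(0xffffffff, v, mask, 32);
    }
    return v;
}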
@@ -1165,6 +1227,182 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+    // subtract 8 from each quantized value
+    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi     = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+    const float m = bq4_1->m;
+    const float s = bq8_1->s;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi     = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int qs;
+    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 <<  4) & 0x00000010; // 1 ->  5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 <<  4) & 0x00000010; // 1 ->  5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 <<  4) & 0x00000010; // 1 ->  5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 <<  4) & 0x00000010; // 1 ->  5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+        const int iby = i + threadIdx.x / qi; // y block index
+
+        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
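All of these new dot-product kernels lean on __dp4a, which treats each 32-bit operand as four packed signed bytes, multiplies the lanes pairwise and adds the four products to the accumulator in a single instruction. A scalar reference of what one call computes (illustrative only, assuming the signed 8-bit packing used here):

// Scalar reference for __dp4a(a, b, c) with signed 8-bit lanes:
// result = c + sum over the 4 byte lanes of a_i * b_i.
__host__ __device__ inline int dp4a_ref(int a, int b, int c) {
    int acc = c;
    for (int lane = 0; lane < 4; ++lane) {
        const int8_t ai = (int8_t)(a >> (8*lane)); // extract lane byte, sign-extended
        const int8_t bi = (int8_t)(b >> (8*lane));
        acc += (int)ai * (int)bi;
    }
    return acc;
}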
@@ -1219,7 +1457,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1235,7 +1472,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
 }
 
 static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1270,7 +1507,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
     const int idst = channel*nrows_dst + row_dst;
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1283,9 +1519,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int
+    const int row_stride_x, const int channel_stride_x) {
 
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1316,7 +1552,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1328,14 +1563,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 }
 
 static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     float * dsti = (float *) cdsti;
 
     *dsti = *xi;
 }
 
 static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     half * dsti = (half *) cdsti;
 
     *dsti = __float2half(*xi);
@@ -1426,7 +1661,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1459,6 +1693,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1475,6 +1714,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1543,45 +1787,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1628,6 +1872,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1635,9 +1924,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1684,7 +1973,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const dim3 block_nums(1, nrows_x, nchannels_x);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x,
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
@@ -1803,6 +2092,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1820,9 +2110,12 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s\n", id, prop.name);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
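g_compute_capabilities stores each device's capability encoded as 100*major + 10*minor, so a 6.1 (Pascal) card becomes 610 and a 7.0 (Volta) card becomes 700; the dispatch further down only enables the new integer-intrinsic path when that value is at least 700. A small hedged sketch of the same query and check, not taken from the patch:

// Sketch: encode a device's compute capability the way ggml_init_cublas does
// and decide whether the quantized mul_mat_vec_q path may be used on it.
#include <cuda_runtime.h>

static bool device_can_use_mul_mat_vec_q(int device) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);        // e.g. major = 6, minor = 1
    const int cc = 100*prop.major + 10*prop.minor; // 6.1 -> 610, 7.0 -> 700
    return cc >= 700;                              // Pascal and older stay on the DMMV path
}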
@@ -1941,7 +2234,7 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
 
-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
@@ -1949,8 +2242,13 @@ inline void ggml_cuda_op_add(
     const int64_t i01_diff = i01_high - i01_low;
 
     // compute
-    [old line not captured in this diff view]
-    [old line not captured in this diff view]
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
 
     (void) src1;
     (void) dst;
@@ -1982,7 +2280,6 @@ inline void ggml_cuda_op_mul(
 
         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-        CUDA_CHECK(cudaGetLastError());
     }
 
     (void) dst;
@@ -2003,7 +2300,6 @@ inline void ggml_cuda_op_silu(
 
     // compute
     silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2026,7 +2322,6 @@ inline void ggml_cuda_op_rms_norm(
 
     // compute
     rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2036,7 +2331,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
+inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
@@ -2048,70 +2343,116 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;
 
-    [old lines not captured in this diff view]
+#ifdef GGML_CUDA_FORCE_DMMV
+    const bool use_mul_mat_vec_q = false;
+#else
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
 
-    bool
-        src0->type ==
-        src0->type ==
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+        src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 ||
+        src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0;
 
-    [old lines not captured in this diff view]
+    // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
+    // However, they have bad performance with Pascal cards.
+    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+#endif
+
+    if (use_mul_mat_vec_q) {
+        size_t as;
+        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+
+        ggml_cuda_pool_free(src1_q8_1, as);
+    } else {
+        // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+        size_t ash;
+        dfloat * src1_dfloat = nullptr; // dfloat == half
+
+        bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+            src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+            src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+        if (src1_convert_f16) {
+            src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+            ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                                  ne00, 1, sizeof(float), 0, 0,
+                                  ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+        }
 #else
-    [old line not captured in this diff view]
+        dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16
 
-    [old lines of the previous dequantize_mul_mat_vec dispatch not captured in this diff view]
-    CUDA_CHECK(cudaGetLastError());
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q2_K:
+                dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_F16:
+                convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
 #ifdef GGML_CUDA_DMMV_F16
-    [old lines not captured in this diff view]
+        if (src1_convert_f16) {
+            ggml_cuda_pool_free(src1_dfloat, ash);
+        }
 #endif // GGML_CUDA_DMMV_F16
+    }
 
     (void) src1;
     (void) dst;
@@ -2182,7 +2523,6 @@ inline void ggml_cuda_op_rope(
 
     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2206,7 +2546,6 @@ inline void ggml_cuda_op_diag_mask_inf(
 
     // compute
    diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2228,7 +2567,6 @@ inline void ggml_cuda_op_soft_max(
 
     // compute
     soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2324,10 +2662,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
-    // if multiple
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
     }
 
     for (int id = 0; id < g_device_count; ++id) {
@@ -2353,6 +2692,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_diff = row_high - row_low;
 
         cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }
 
         if (src0_on_device && src0_is_contiguous) {
             if (src0_is_f32) {
@@ -2428,8 +2773,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
                 const int64_t i11 = i13*ne12 + i12;
 
-                cudaStream_t cudaStream_main = g_cudaStreams_main[id];
-
                 // for split tensors the data begins at i0 == i0_offset_low
                 char  * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
                 float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2489,6 +2832,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
                 // do the computation
                 op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                CUDA_CHECK(cudaGetLastError());
 
                 // copy dst to host or other device if necessary
                 if (!dst_on_device) {
@@ -2518,6 +2862,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
                 }
             }
+
+            // signify to main device that other device is done
+            if (split && g_device_count > 1 && id != g_main_device) {
+                CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+            }
         }
     }
 }
@@ -2529,7 +2878,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         }
 
         CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaDeviceSynchronize());
 
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2544,11 +2892,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
         }
     }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    [old line not captured in this diff view]
-    [old line not captured in this diff view]
+    // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+    // Due to flatten_rows == true this does in practice not make a difference however.
+    // Better solution would be nice but right now that would require disproportionate changes.
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
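The multi-GPU path replaces the old per-iteration cudaDeviceSynchronize() with CUDA events: the main device records an event once the input is ready, secondary devices make their streams wait on it, and each secondary device records its own event that the main stream waits on before the result is consumed. A minimal two-stream sketch of that record/wait pattern (illustrative only; produce and consume are hypothetical kernels, error checking omitted):

#include <cuda_runtime.h>

__global__ void produce() {}  // stands in for "main device prepares input"
__global__ void consume() {}  // stands in for "other device uses that input"

// Order work across two streams with an event instead of a device-wide sync,
// mirroring what ggml_cuda_op now does across GPUs.
void run_ordered(cudaStream_t stream_main, cudaStream_t stream_other) {
    cudaEvent_t ev;
    cudaEventCreateWithFlags(&ev, cudaEventDisableTiming); // timing disabled -> cheaper

    produce<<<1, 256, 0, stream_main>>>();    // main stream prepares the data
    cudaEventRecord(ev, stream_main);         // mark "input ready" on the main stream

    cudaStreamWaitEvent(stream_other, ev, 0); // other stream blocks until the event fires
    consume<<<1, 256, 0, stream_other>>>();   // now safe to read the produced data

    cudaEventDestroy(ev);
}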
@@ -2653,8 +3022,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0
-            ggml_cuda_op(src0, src1, dst,
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
@@ -2777,31 +3146,38 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
 
         extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+        }
     }
 
     tensor->extra = extra;
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
     for (int id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id]
-        [old line not captured in this diff view]
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
-        [old line not captured in this diff view]
-        [old line not captured in this diff view]
+        if (extra->events[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+        }
     }
 
     delete extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -2810,11 +3186,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src0->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
        }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
@@ -2822,11 +3198,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     memset(extra, 0, sizeof(*extra));
 
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
@@ -2865,11 +3242,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true);
+    ggml_cuda_assign_buffers_impl(tensor, true, false);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true);
 }
 
 void ggml_cuda_set_main_device(int main_device) {