llama_cpp 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +195 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +499 -118
- data/ext/llama_cpp/src/ggml-cuda.h +1 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +357 -176
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +230 -261
- data/ext/llama_cpp/src/llama.h +31 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +21 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
 
 #define QK4_0 32
 #define QR4_0 2
+#define QI4_0 4
 typedef struct {
     half d; // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
+#define QI4_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
 typedef struct {
     half d; // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 8
 typedef struct {
     half d; // delta
     int8_t qs[QK8_0]; // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
 
+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 8
+typedef struct {
+    half d; // delta
+    half s; // unquantized sum
+    int8_t qs[QK8_0]; // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+
 //================================= k-quants
 
 #ifdef GGML_QKK_64
@@ -198,14 +216,15 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef
-#define
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
 #endif
 
 #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +233,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -223,6 +247,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }
 
+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __hadd(x[i], __float2half(y[i]));
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -256,7 +289,6 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -700,7 +732,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -805,7 +836,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -909,7 +939,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1014,7 +1043,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1125,7 +1153,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
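For reference, every deleted `__syncthreads()` above sits directly in front of the same warp-level reduction: a butterfly of `__shfl_xor_sync` calls that only exchanges data between the 32 lanes of one warp, so no block-wide barrier is needed. A minimal standalone sketch of that reduction pattern (file name, launch size, and test data are illustrative, not part of the gem):

```cuda
// warp_reduce_demo.cu -- illustrative only; not part of the package.
// Each warp sums its 32 lane values with a butterfly XOR shuffle.
// The 0xffffffff mask already synchronizes the participating lanes,
// so no __syncthreads() is required for a single-warp reduction.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_sum(const float * x, float * out) {
    float tmp = x[threadIdx.x];
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }
    if (threadIdx.x == 0) {
        *out = tmp; // after the loop every lane holds the full sum; lane 0 writes it
    }
}

int main() {
    float hx[32], *dx = nullptr, *dout = nullptr, hout = 0.0f;
    for (int i = 0; i < 32; ++i) hx[i] = 1.0f;          // expected sum: 32
    cudaMalloc(&dx, sizeof(hx));
    cudaMalloc(&dout, sizeof(float));
    cudaMemcpy(dx, hx, sizeof(hx), cudaMemcpyHostToDevice);
    warp_sum<<<1, 32>>>(dx, dout);
    cudaMemcpy(&hout, dout, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sum = %f\n", hout);
    cudaFree(dx);
    cudaFree(dout);
    return 0;
}
```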
@@ -1144,6 +1171,41 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
+static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i / QK8_0; // block index
+    const int iqs = i % QK8_0; // quant index
+
+    const float xi = x[i];
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    y[ib].d = d;
+    y[ib].s = sum;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
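The kernel above assigns one warp per 32-value block: every lane loads one input, the two shuffle reductions give all lanes the block's absolute maximum and sum, and only lane 0 stores the scale `d` and the sum `s`. A host-side sketch of the same per-block arithmetic, using a hypothetical `BlockQ8_1` mirror of the device struct (`float` in place of `half`, illustrative only):

```cpp
// quantize_q8_1_ref.cpp -- illustrative host-side mirror of the per-block
// arithmetic in quantize_q8_1; BlockQ8_1 is a hypothetical stand-in for the
// device-side block_q8_1.
#include <cmath>
#include <cstdint>
#include <cstdio>

constexpr int QK8_1 = 32;

struct BlockQ8_1 {
    float  d;           // delta (scale)
    float  s;           // unquantized sum of the 32 inputs
    int8_t qs[QK8_1];   // quants
};

BlockQ8_1 quantize_block_q8_1(const float * x) {
    BlockQ8_1 b{};
    float amax = 0.0f;
    float sum  = 0.0f;
    for (int i = 0; i < QK8_1; ++i) {            // what the warp reduction computes
        amax = std::fmax(amax, std::fabs(x[i]));
        sum += x[i];
    }
    const float d = amax / 127.0f;
    for (int i = 0; i < QK8_1; ++i) {
        b.qs[i] = amax == 0.0f ? 0 : (int8_t) std::round(x[i] / d);
    }
    b.d = d;
    b.s = sum;                                   // reused for the q4_1/q5_1 min term below
    return b;
}

int main() {
    float x[QK8_1];
    for (int i = 0; i < QK8_1; ++i) x[i] = 0.1f * (i - 16);
    const BlockQ8_1 b = quantize_block_q8_1(x);
    printf("d = %f, s = %f, qs[0] = %d\n", b.d, b.s, (int) b.qs[0]);
    return 0;
}
```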
@@ -1165,6 +1227,182 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+    // subtract 8 from each quantized value
+    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+    const float m = bq4_1->m;
+    const float s = bq8_1->s;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int qs;
+    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t * x = (const block_q_t *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+        const int iby = i + threadIdx.x / qi; // y block index
+
+        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
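The `vec_dot_*_q8_1` functions above rely on `__dp4a(a, b, c)`, which treats `a` and `b` as four packed signed 8-bit values, multiplies them lane-wise and adds the results to `c`; the 4-bit and 5-bit formats are first widened to signed bytes with shifts, masks and `__vsub4`. A host-side emulation of that intrinsic with a small worked example (illustrative only, not part of the package):

```cpp
// dp4a_ref.cpp -- illustrative host emulation of CUDA's __dp4a intrinsic.
#include <cstdint>
#include <cstdio>

// Treat a and b as four packed signed 8-bit lanes, multiply lane-wise and
// accumulate into c -- the same contract as __dp4a(a, b, c).
int dp4a_ref(int a, int b, int c) {
    for (int lane = 0; lane < 4; ++lane) {
        const int8_t av = (int8_t) (((unsigned) a >> (8 * lane)) & 0xFF);
        const int8_t bv = (int8_t) (((unsigned) b >> (8 * lane)) & 0xFF);
        c += (int) av * (int) bv;
    }
    return c;
}

int main() {
    // q4_0-style values after subtracting 8: {-7, -6, -5, -4}, against
    // q8_1 values {10, 20, 30, 40}: expected -7*10 - 6*20 - 5*30 - 4*40 = -500.
    const int8_t av[4] = {-7, -6, -5, -4};
    const int8_t bv[4] = {10, 20, 30, 40};
    uint32_t ua = 0, ub = 0;
    for (int lane = 0; lane < 4; ++lane) {
        ua |= (uint32_t) (uint8_t) av[lane] << (8 * lane);
        ub |= (uint32_t) (uint8_t) bv[lane] << (8 * lane);
    }
    printf("dp4a_ref = %d (expected -500)\n", dp4a_ref((int) ua, (int) ub, 0));
    return 0;
}
```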
@@ -1219,7 +1457,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1235,7 +1472,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
 }
 
 static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1270,7 +1507,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
     const int idst = channel*nrows_dst + row_dst;
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1283,9 +1519,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int
+    const int row_stride_x, const int channel_stride_x) {
 
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1316,7 +1552,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1328,14 +1563,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 }
 
 static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     float * dsti = (float *) cdsti;
 
     *dsti = *xi;
 }
 
 static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     half * dsti = (half *) cdsti;
 
     *dsti = __float2half(*xi);
@@ -1426,7 +1661,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1459,6 +1693,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1475,6 +1714,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1543,45 +1787,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1628,6 +1872,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
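All of these launchers share one geometry: one warp (32 threads) per matrix row in x, `GGML_CUDA_MMV_Y` rows per thread block in y, and `block_num_y` blocks to cover `nrows`. A small sketch of that arithmetic with assumed example values (the defaults `GGML_CUDA_MMV_Y = 1` and `WARP_SIZE = 32`; illustrative only):

```cpp
// launch_geometry.cpp -- illustrative arithmetic behind the launchers above.
#include <cstdio>

int main() {
    const int WARP_SIZE       = 32;
    const int GGML_CUDA_MMV_Y = 1;     // rows handled per block in the y dimension
    const int nrows           = 4096;  // example row count

    // rounded-up division: enough blocks in y to cover every row
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    printf("grid  = (1, %d, 1)\n", block_num_y);                  // block_nums
    printf("block = (%d, %d, 1)\n", WARP_SIZE, GGML_CUDA_MMV_Y);  // block_dims
    return 0;
}
```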
@@ -1635,9 +1924,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1684,7 +1973,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const dim3 block_nums(1, nrows_x, nchannels_x);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x,
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
@@ -1803,6 +2092,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1820,9 +2110,12 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, " Device %d: %s\n", id, prop.name);
+        fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
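`g_compute_capabilities` stores each device's compute capability encoded as `100*major + 10*minor`, so a CC 6.1 (Pascal) card is stored as 610 and the `>= 700` check later in this diff selects Volta or newer. A standalone sketch of the same query (illustrative, not the gem's code):

```cuda
// cc_query.cu -- illustrative: query each device and encode its compute
// capability the same way the diff does (100*major + 10*minor).
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    for (int id = 0; id < device_count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        const int cc = 100*prop.major + 10*prop.minor;
        printf("Device %d: %s, compute capability %d.%d (encoded %d)\n",
               id, prop.name, prop.major, prop.minor, cc);
    }
    return 0;
}
```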
@@ -1941,7 +2234,7 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
 
-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
@@ -1949,8 +2242,13 @@ inline void ggml_cuda_op_add(
     const int64_t i01_diff = i01_high - i01_low;
 
     // compute
-
-
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
 
     (void) src1;
     (void) dst;
@@ -1982,7 +2280,6 @@ inline void ggml_cuda_op_mul(
 
         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-        CUDA_CHECK(cudaGetLastError());
     }
 
     (void) dst;
@@ -2003,7 +2300,6 @@ inline void ggml_cuda_op_silu(
 
     // compute
     silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2026,7 +2322,6 @@ inline void ggml_cuda_op_rms_norm(
 
     // compute
     rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2036,7 +2331,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
-inline void
+inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
@@ -2048,70 +2343,116 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;
 
-
-
-
-
+#ifdef GGML_CUDA_FORCE_DMMV
+    const bool use_mul_mat_vec_q = false;
+#else
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
 
-    bool
-    src0->type ==
-    src0->type ==
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+        src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 ||
+        src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0;
 
-
-
-
-
-
-
+    // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
+    // However, they have bad performance with Pascal cards.
+    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+#endif
+
+    if (use_mul_mat_vec_q) {
+        size_t as;
+        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+
+        ggml_cuda_pool_free(src1_q8_1, as);
+    } else {
+        // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+        size_t ash;
+        dfloat * src1_dfloat = nullptr; // dfloat == half
+
+        bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+            src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+            src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+        if (src1_convert_f16) {
+            src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+            ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                                  ne00, 1, sizeof(float), 0, 0,
+                                  ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+        }
 #else
-
+        dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    CUDA_CHECK(cudaGetLastError());
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q2_K:
+                dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_F16:
+                convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
 #ifdef GGML_CUDA_DMMV_F16
-
-
-
+        if (src1_convert_f16) {
+            ggml_cuda_pool_free(src1_dfloat, ash);
+        }
 #endif // GGML_CUDA_DMMV_F16
+    }
 
     (void) src1;
     (void) dst;
@@ -2182,7 +2523,6 @@ inline void ggml_cuda_op_rope(
 
     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2206,7 +2546,6 @@ inline void ggml_cuda_op_diag_mask_inf(
 
     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2228,7 +2567,6 @@ inline void ggml_cuda_op_soft_max(
 
     // compute
     soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2324,10 +2662,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
-    // if multiple
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
     }
 
     for (int id = 0; id < g_device_count; ++id) {
@@ -2353,6 +2692,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_diff = row_high - row_low;
 
         cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }
 
         if (src0_on_device && src0_is_contiguous) {
             if (src0_is_f32) {
@@ -2428,8 +2773,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     }
                     const int64_t i11 = i13*ne12 + i12;
 
-                    cudaStream_t cudaStream_main = g_cudaStreams_main[id];
-
                     // for split tensors the data begins at i0 == i0_offset_low
                     char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
                     float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2489,6 +2832,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
                     // do the computation
                     op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                    CUDA_CHECK(cudaGetLastError());
 
                     // copy dst to host or other device if necessary
                     if (!dst_on_device) {
@@ -2518,6 +2862,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                         CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
                     }
                 }
+
+                // signify to main device that other device is done
+                if (split && g_device_count > 1 && id != g_main_device) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+                }
             }
         }
     }
@@ -2529,7 +2878,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         }
 
         CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaDeviceSynchronize());
 
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
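The event changes in this function follow a standard record/wait pattern: the main device records an event once its input data is ready, the other devices make their streams wait on it before computing and record their own events when done, and the main device finally waits on those. A self-contained sketch of that pattern under assumed names (the streams, device count, and the omitted work are illustrative, not the gem's code):

```cuda
// event_sync_sketch.cu -- illustrative record/wait pattern used by the
// multi-GPU changes above; the kernels and buffer setup are omitted.
#include <cuda_runtime.h>

int main() {
    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    if (device_count < 2) return 0;              // pattern only matters with >1 GPU
    if (device_count > 16) device_count = 16;    // keep the fixed-size arrays safe

    const int main_device = 0;
    cudaStream_t streams[16];
    cudaEvent_t  events[16];

    for (int id = 0; id < device_count; ++id) {
        cudaSetDevice(id);
        cudaStreamCreate(&streams[id]);
        cudaEventCreateWithFlags(&events[id], cudaEventDisableTiming);
    }

    // main device publishes its progress...
    cudaSetDevice(main_device);
    // (enqueue work that produces the shared input on streams[main_device])
    cudaEventRecord(events[main_device], streams[main_device]);

    // ...the other devices wait for it before consuming the data,
    // then record their own completion events
    for (int id = 0; id < device_count; ++id) {
        if (id == main_device) continue;
        cudaSetDevice(id);
        cudaStreamWaitEvent(streams[id], events[main_device], 0);
        // (enqueue this device's share of the work on streams[id])
        cudaEventRecord(events[id], streams[id]);
    }

    // finally the main device waits for everyone before using the results
    cudaSetDevice(main_device);
    for (int id = 0; id < device_count; ++id) {
        if (id != main_device) {
            cudaStreamWaitEvent(streams[main_device], events[id], 0);
        }
    }
    cudaDeviceSynchronize();

    for (int id = 0; id < device_count; ++id) {
        cudaSetDevice(id);
        cudaEventDestroy(events[id]);
        cudaStreamDestroy(streams[id]);
    }
    return 0;
}
```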
@@ -2544,11 +2892,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
         }
     }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-
+    // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+    // Due to flatten_rows == true this does in practice not make a difference however.
+    // Better solution would be nice but right now that would require disproportionate changes.
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2653,8 +3022,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0
-            ggml_cuda_op(src0, src1, dst,
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
@@ -2777,31 +3146,38 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
 
         extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+        }
     }
 
     tensor->extra = extra;
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
        return;
     }
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
     for (int id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id]
-
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaFree(extra->data_device[id]));
        }
 
-
-
+        if (extra->events[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+        }
     }
 
     delete extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -2810,11 +3186,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src0->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
        }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
@@ -2822,11 +3198,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     memset(extra, 0, sizeof(*extra));
 
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
@@ -2865,11 +3242,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true);
+    ggml_cuda_assign_buffers_impl(tensor, true, false);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true);
 }
 
 void ggml_cuda_set_main_device(int main_device) {