llama_cpp 0.3.1 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(

 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization

 #define QK4_0 32
 #define QR4_0 2
+#define QI4_0 4
 typedef struct {
     half d;                // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

 #define QK4_1 32
 #define QR4_1 2
+#define QI4_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
 typedef struct {
     half d;        // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 8
 typedef struct {
     half d;           // delta
     int8_t qs[QK8_0]; // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 8
+typedef struct {
+    half d;           // delta
+    half s;           // unquantized sum
+    int8_t qs[QK8_0]; // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+
 //================================= k-quants

 #ifdef GGML_QKK_64
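The new q8_1 block stores, alongside the scale d, the unquantized sum s of its 32 values. For the offset formats (q4_1, q5_1) the dot product Σ(d·q_i + m)·y_i splits into d·Σ(q_i·y_i) + m·Σy_i, so the second term can be folded in as m·s without touching the individual y values. A scalar sketch of what the vec_dot_q4_1_q8_1 kernel added further down accumulates for one block pair (illustrative only, not part of the diff; hypothetical helper name, assumes the block layouts above):

    static __device__ float vec_dot_q4_1_q8_1_ref(const block_q4_1 * bq4, const block_q8_1 * bq8) {
        const float d4 = __half2float(bq4->d);
        const float m4 = __half2float(bq4->m);
        const float d8 = __half2float(bq8->d);

        int sumi = 0;
        for (int j = 0; j < QK4_1/2; ++j) {
            sumi += (bq4->qs[j] & 0x0F) * bq8->qs[j];           // low nibbles hold values 0..15
            sumi += (bq4->qs[j] >>   4) * bq8->qs[j + QK4_1/2]; // high nibbles hold values 16..31
        }
        // x_i = d4*q_i + m4 and y_i ~ d8*p_i, hence dot(x, y) ~ d4*d8*sumi + m4*s
        return d4*d8*sumi + m4*__half2float(bq8->s);
    }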
@@ -198,14 +216,15 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef
-#define
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
 #endif

 #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +233,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -265,7 +289,6 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }

     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -709,7 +732,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -814,7 +836,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -918,7 +939,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1023,7 +1043,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1134,7 +1153,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1153,6 +1171,41 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }

+static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i / QK8_0; // block index
+    const int iqs = i % QK8_0; // quant index
+
+    const float xi = x[i];
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    y[ib].d = d;
+    y[ib].s = sum;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
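quantize_q8_1 exploits QK8_1 being equal to the warp size of 32: each warp covers exactly one block, so the block-wide absolute maximum and sum are computed purely with register shuffles, with no shared memory and no barrier. The same butterfly pattern is also why the __syncthreads() calls removed above were redundant. The reduction in isolation (sketch, assumes a full 32-thread warp):

    static __device__ __forceinline__ float warp_reduce_sum(float v) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            v += __shfl_xor_sync(0xffffffff, v, mask, 32); // after 5 steps every lane holds the warp-wide sum
        }
        return v;
    }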
@@ -1174,6 +1227,182 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }

+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+    // subtract 8 from each quantized value
+    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+    const float m = bq4_1->m;
+    const float s = bq8_1->s;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int qs;
+    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t * x = (const block_q_t *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+        const int iby = i + threadIdx.x / qi; // y block index
+
+        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
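The vec_dot_*_q8_1 functions above pack four 8-bit quants into one 32-bit register and feed them to __dp4a, which multiplies the four signed 8-bit lanes of its first two operands pairwise and adds the products to the third operand; __vsub4 subtracts per byte, which is how the -8 (q4_0) and -16 (q5_0) offsets are applied to all four lanes at once. What __dp4a(a, b, c) computes, written out as scalar code (sketch only):

    static __device__ __forceinline__ int dp4a_ref(int a, int b, int c) {
        const int8_t * va = (const int8_t *) &a;
        const int8_t * vb = (const int8_t *) &b;
        for (int k = 0; k < 4; ++k) {
            c += va[k] * vb[k]; // per-byte products accumulated into c
        }
        return c;
    }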
@@ -1228,7 +1457,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }

     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1279,7 +1507,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
     const int idst = channel*nrows_dst + row_dst;

     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1325,7 +1552,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }

     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1435,7 +1661,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
     }

     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1489,6 +1714,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }

+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1557,45 +1787,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu

 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1642,6 +1872,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
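All five launchers above share the same geometry: one 32-thread warp computes one output row, and GGML_CUDA_MMV_Y warps are packed into each block. The same arithmetic as standalone helpers (hypothetical names, sketch only):

    static dim3 mmvq_block_nums(const int nrows) {
        return dim3(1, (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y, 1); // ceil(nrows / warps per block)
    }
    static dim3 mmvq_block_dims() {
        return dim3(WARP_SIZE, GGML_CUDA_MMV_Y, 1); // threadIdx.x = lane, threadIdx.y = warp within the block
    }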
@@ -1649,9 +1924,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c

 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1817,6 +2092,7 @@ static size_t g_scratch_offset = 0;

 static int g_device_count = -1;
 static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1834,9 +2110,12 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, " Device %d: %s\n", id, prop.name);
+        fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
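g_compute_capabilities stores each device's compute capability as 100*major + 10*minor, so 6.1 (Pascal) becomes 610 and 7.0 (Volta) becomes 700; the dispatch further down compares this value against 700. A tiny illustration (hypothetical helper, not in the diff):

    static bool prefers_mul_mat_vec_q(const int major, const int minor) {
        const int cc = 100*major + 10*minor; // same encoding as g_compute_capabilities
        return cc >= 700;                    // the threshold used in ggml_cuda_op_mul_mat_vec
    }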
@@ -1970,7 +2249,6 @@ inline void ggml_cuda_op_add(
     } else {
         GGML_ASSERT(false);
     }
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2002,7 +2280,6 @@ inline void ggml_cuda_op_mul(

         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-        CUDA_CHECK(cudaGetLastError());
     }

     (void) dst;
@@ -2023,7 +2300,6 @@ inline void ggml_cuda_op_silu(

     // compute
     silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2046,7 +2322,6 @@ inline void ggml_cuda_op_rms_norm(

     // compute
     rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2056,7 +2331,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }

-inline void ggml_cuda_op_dequantize_mul_mat_vec(
+inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
@@ -2068,70 +2343,116 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;

-
-
-
-
+#ifdef GGML_CUDA_FORCE_DMMV
+    const bool use_mul_mat_vec_q = false;
+#else
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));

-bool
-src0->type ==
-src0->type ==
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+        src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 ||
+        src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0;

-
-
-
-
-
-
+    // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
+    // However, they have bad performance with Pascal cards.
+    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+#endif
+
+    if (use_mul_mat_vec_q) {
+        size_t as;
+        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+
+        ggml_cuda_pool_free(src1_q8_1, as);
+    } else {
+        // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+        size_t ash;
+        dfloat * src1_dfloat = nullptr; // dfloat == half
+
+        bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+            src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+            src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+        if (src1_convert_f16) {
+            src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+            ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                ne00, 1, sizeof(float), 0, 0,
+                ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+        }
 #else
-
+        dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    CUDA_CHECK(cudaGetLastError());
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q2_K:
+                dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_F16:
+                convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }

 #ifdef GGML_CUDA_DMMV_F16
-
-
-
+        if (src1_convert_f16) {
+            ggml_cuda_pool_free(src1_dfloat, ash);
+        }
 #endif // GGML_CUDA_DMMV_F16
+    }

     (void) src1;
     (void) dst;
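Two details of the dispatch above: compiling with GGML_CUDA_FORCE_DMMV defined disables the new integer path entirely, and the temporary buffer for the quantized src1 row holds ne00/QK8_1 blocks, i.e. ne00*sizeof(block_q8_1)/QK8_1 bytes. The sizing as a standalone sketch (hypothetical helper; assumes ne00 % QK8_1 == 0, which the GGML_CUDA_DMMV_X assert guarantees with the default of 32):

    static size_t src1_q8_1_bytes(const int64_t ne00) {
        return (ne00 / QK8_1) * sizeof(block_q8_1); // equal to ne00*sizeof(block_q8_1)/QK8_1
    }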
@@ -2202,7 +2523,6 @@ inline void ggml_cuda_op_rope(

     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) dst;
     (void) src0_ddq_i;
@@ -2226,7 +2546,6 @@ inline void ggml_cuda_op_diag_mask_inf(

     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) dst;
     (void) src0_ddq_i;
@@ -2248,7 +2567,6 @@ inline void ggml_cuda_op_soft_max(

     // compute
     soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2344,10 +2662,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};

-    // if multiple
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
     }

     for (int id = 0; id < g_device_count; ++id) {
@@ -2373,6 +2692,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_diff = row_high - row_low;

         cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }

         if (src0_on_device && src0_is_contiguous) {
             if (src0_is_f32) {
@@ -2448,8 +2773,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
                 const int64_t i11 = i13*ne12 + i12;

-                cudaStream_t cudaStream_main = g_cudaStreams_main[id];
-
                 // for split tensors the data begins at i0 == i0_offset_low
                 char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
                 float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2509,6 +2832,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

                 // do the computation
                 op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                CUDA_CHECK(cudaGetLastError());

                 // copy dst to host or other device if necessary
                 if (!dst_on_device) {
@@ -2538,6 +2862,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
                    }
                }
+
+                // signify to main device that other device is done
+                if (split && g_device_count > 1 && id != g_main_device) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+                }
             }
         }
     }
@@ -2549,7 +2878,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         }

         CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaDeviceSynchronize());

         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2564,6 +2892,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
         }
     }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
 }

 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
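The event logic added above reduces to a simple record/wait handshake per device pair: the main device records an event once the input is ready, every other device makes its stream wait on that event before computing, records its own event when done, and the main device's stream waits on those before the final synchronization. A minimal sketch of the pattern with two hypothetical streams and events (error checking omitted):

    #include <cuda_runtime.h>

    static void split_sync_sketch(cudaStream_t stream_main, cudaStream_t stream_other,
                                  cudaEvent_t ev_main, cudaEvent_t ev_other) {
        cudaEventRecord(ev_main, stream_main);         // main device: input data is ready
        cudaStreamWaitEvent(stream_other, ev_main, 0); // other device: consume it only after that
        // ... enqueue kernels on stream_other here ...
        cudaEventRecord(ev_other, stream_other);       // other device: results are ready
        cudaStreamWaitEvent(stream_main, ev_other, 0); // main device: use the results only after that
    }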
@@ -2679,8 +3022,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0
-            ggml_cuda_op(src0, src1, dst,
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
@@ -2803,25 +3146,32 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);

         extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+        }
     }

     tensor->extra = extra;
 }

 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

     for (int id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id]
-
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaFree(extra->data_device[id]));
         }

-
-
+        if (extra->events[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+        }
     }

     delete extra;