llama_cpp 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(

 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization

 #define QK4_0 32
 #define QR4_0 2
+#define QI4_0 4
 typedef struct {
     half d; // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

 #define QK4_1 32
 #define QR4_1 2
+#define QI4_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
 typedef struct {
     half d; // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 8
 typedef struct {
     half d; // delta
     int8_t qs[QK8_0]; // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 8
+typedef struct {
+    half d; // delta
+    half s; // unquantized sum
+    int8_t qs[QK8_0]; // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+
 //================================= k-quants

 #ifdef GGML_QKK_64
@@ -198,14 +216,15 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef
-#define
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
 #endif

 #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +233,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -265,7 +289,6 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }

     // sum up partial sums
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -709,7 +732,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -814,7 +836,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -918,7 +939,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1023,7 +1043,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1134,7 +1153,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
 #endif

     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1153,6 +1171,41 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }

+static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i / QK8_0; // block index
+    const int iqs = i % QK8_0; // quant index
+
+    const float xi = x[i];
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    y[ib].d = d;
+    y[ib].s = sum;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
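For orientation, the new quantize_q8_1 kernel above packs every 32 consecutive floats of the activation vector into one block_q8_1: the block scale is d = amax / 127 and the unquantized block sum s is stored next to the int8 quants. The host-side C++ sketch below mirrors that math for a single block; it is an illustration only (the BlockQ8_1Ref type and the function name are invented here) and is not code from the gem.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct BlockQ8_1Ref {      // hypothetical host-side mirror of block_q8_1, with float fields
        float  d;              // delta (scale)
        float  s;              // unquantized sum of the 32 inputs
        int8_t qs[32];         // quants
    };

    BlockQ8_1Ref quantize_block_q8_1_ref(const float * x) {
        float amax = 0.0f, sum = 0.0f;
        for (int i = 0; i < 32; ++i) {
            amax = std::max(amax, std::fabs(x[i]));  // the kernel reduces this with warp shuffles
            sum += x[i];
        }
        BlockQ8_1Ref y;
        y.d = amax / 127.0f;
        y.s = sum;
        for (int i = 0; i < 32; ++i) {
            // same rounding as the kernel: q = round(x / d), with an all-zero block mapped to 0
            y.qs[i] = amax == 0.0f ? 0 : (int8_t) std::lround(x[i] / y.d);
        }
        return y;
    }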
@@ -1174,6 +1227,182 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }

+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+    // subtract 8 from each quantized value
+    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi     = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+    const float m = bq4_1->m;
+    const float s = bq8_1->s;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi     = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int qs;
+    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 <<  4) & 0x00000010; // 1 ->  5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 <<  4) & 0x00000010; // 1 ->  5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 <<  4) & 0x00000010; // 1 ->  5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 <<  4) & 0x00000010; // 1 ->  5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+        const int iby = i + threadIdx.x / qi; // y block index
+
+        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
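The vec_dot_*_q8_1 device functions above lean on __vsub4 and __dp4a to do four 8-bit multiply-accumulates per instruction. As a rough scalar illustration (not code from the gem; the function name and pointer layout below are invented for the example), this is the arithmetic vec_dot_q4_0_q8_1 performs on the eight nibbles one thread loads as a single 32-bit word:

    #include <cstdint>

    // Scalar equivalent of the __vsub4/__dp4a work in vec_dot_q4_0_q8_1 (illustration only).
    // q4 points at 4 bytes (8 packed nibbles) of a q4_0 block; q8_lo/q8_hi point at the
    // q8_1 quants that line up with the low and high nibbles (16 positions apart in the block).
    float vec_dot_q4_0_q8_1_scalar(const uint8_t * q4,
                                   const int8_t  * q8_lo, const int8_t * q8_hi,
                                   float d4, float d8) {
        int sumi = 0;
        for (int j = 0; j < 4; ++j) {
            sumi += ((q4[j] & 0x0F) - 8) * q8_lo[j]; // low nibble, offset by -8 as __vsub4 does
            sumi += ((q4[j] >>   4) - 8) * q8_hi[j]; // high nibble
        }
        return sumi * d4 * d8; // scale by both block deltas, as in the kernel
    }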
@@ -1228,7 +1457,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }

     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1279,7 +1507,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
     const int idst = channel*nrows_dst + row_dst;

     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1325,7 +1552,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }

     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1435,7 +1661,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
     }

     // sum up partial sums
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1489,6 +1714,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }

+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1557,45 +1787,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu

 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1642,6 +1872,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
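The launch geometry used by the mul_mat_vec_*_q8_1_cuda wrappers above is one warp along x per matrix row and GGML_CUDA_MMV_Y rows per block along y, so the y grid size is a ceiling division of the row count. A minimal sketch of that computation (the function name and values here are examples, not taken from the gem):

    #include <cuda_runtime.h>

    // Example only: grid/block shape as computed by the mul_mat_vec_*_q8_1_cuda wrappers.
    void sketch_mmv_launch_dims(int nrows) {
        const int warp_size   = 32;                              // WARP_SIZE in ggml-cuda.cu
        const int mmv_y       = 1;                               // GGML_CUDA_MMV_Y default
        const int block_num_y = (nrows + mmv_y - 1) / mmv_y;     // ceil(nrows / MMV_Y)
        const dim3 block_nums(1, block_num_y, 1);                // one block column, block_num_y block rows
        const dim3 block_dims(warp_size, mmv_y, 1);              // a warp per row, mmv_y rows per block
        (void) block_nums; (void) block_dims;                    // a real launcher passes these to <<<...>>>
    }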
@@ -1649,9 +1924,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c

 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows +
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1817,6 +2092,7 @@ static size_t g_scratch_offset = 0;

 static int g_device_count = -1;
 static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1834,9 +2110,12 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s\n", id, prop.name);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
@@ -1970,7 +2249,6 @@ inline void ggml_cuda_op_add(
     } else {
         GGML_ASSERT(false);
     }
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2002,7 +2280,6 @@ inline void ggml_cuda_op_mul(

         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-        CUDA_CHECK(cudaGetLastError());
     }

     (void) dst;
@@ -2023,7 +2300,6 @@ inline void ggml_cuda_op_silu(

     // compute
     silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2046,7 +2322,6 @@ inline void ggml_cuda_op_rms_norm(

     // compute
     rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2056,7 +2331,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }

-inline void ggml_cuda_op_dequantize_mul_mat_vec(
+inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
@@ -2068,70 +2343,116 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;

-
-
-
-
+#ifdef GGML_CUDA_FORCE_DMMV
+    const bool use_mul_mat_vec_q = false;
+#else
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));

-    bool
-    src0->type ==
-    src0->type ==
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+        src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 ||
+        src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0;

-
-
-
-
-
-
+    // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
+    // However, they have bad performance with Pascal cards.
+    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+#endif
+
+    if (use_mul_mat_vec_q) {
+        size_t as;
+        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+
+        ggml_cuda_pool_free(src1_q8_1, as);
+    } else {
+        // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+        size_t ash;
+        dfloat * src1_dfloat = nullptr; // dfloat == half
+
+        bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+            src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+            src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+        if (src1_convert_f16) {
+            src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+            ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                ne00, 1, sizeof(float), 0, 0,
+                ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+        }
 #else
-
+        dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    CUDA_CHECK(cudaGetLastError());
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q2_K:
+                dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_F16:
+                convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }

 #ifdef GGML_CUDA_DMMV_F16
-
-
-
+        if (src1_convert_f16) {
+            ggml_cuda_pool_free(src1_dfloat, ash);
+        }
 #endif // GGML_CUDA_DMMV_F16
+    }

     (void) src1;
     (void) dst;
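The dispatch above gates the new integer-intrinsic path on g_compute_capabilities[id] >= 700 (the 100*major + 10*minor encoding recorded in ggml_init_cublas), unless GGML_CUDA_FORCE_DMMV is defined at build time. A small standalone program in the same spirit, shown only to illustrate the encoding (it is not part of the gem):

    #include <cuda_runtime.h>
    #include <cstdio>

    // Illustration: how 100*major + 10*minor relates to the >= 700 gate above.
    int main() {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
            return 1;
        }
        const int cc = 100*prop.major + 10*prop.minor; // e.g. 6.1 -> 610 (Pascal), 7.0 -> 700 (Volta)
        std::printf("compute capability %d.%d -> %d, mul_mat_vec_q eligible: %s\n",
                    prop.major, prop.minor, cc, cc >= 700 ? "yes" : "no");
        return 0;
    }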
@@ -2202,7 +2523,6 @@ inline void ggml_cuda_op_rope(

     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) dst;
     (void) src0_ddq_i;
@@ -2226,7 +2546,6 @@ inline void ggml_cuda_op_diag_mask_inf(

     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) dst;
     (void) src0_ddq_i;
@@ -2248,7 +2567,6 @@ inline void ggml_cuda_op_soft_max(

     // compute
     soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2344,10 +2662,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};

-    // if multiple
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
     }

     for (int id = 0; id < g_device_count; ++id) {
@@ -2373,6 +2692,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_diff = row_high - row_low;

         cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }

         if (src0_on_device && src0_is_contiguous) {
             if (src0_is_f32) {
@@ -2448,8 +2773,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             }
             const int64_t i11 = i13*ne12 + i12;

-            cudaStream_t cudaStream_main = g_cudaStreams_main[id];
-
             // for split tensors the data begins at i0 == i0_offset_low
             char  * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
             float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2509,6 +2832,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

             // do the computation
             op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+            CUDA_CHECK(cudaGetLastError());

             // copy dst to host or other device if necessary
             if (!dst_on_device) {
@@ -2538,6 +2862,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
                    }
                }
+
+                // signify to main device that other device is done
+                if (split && g_device_count > 1 && id != g_main_device) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+                }
            }
        }
    }
@@ -2549,7 +2878,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         }

         CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaDeviceSynchronize());

         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2564,6 +2892,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
         }
     }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
 }

 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
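The multi-GPU changes above replace a blocking per-device cudaDeviceSynchronize with per-device events: the main device records an event once the input data is ready, secondary streams wait on it before starting, and the main stream later waits on each secondary device's "done" event before the final synchronize. A minimal, self-contained sketch of that record/wait pattern (not code from the gem; the function, stream, and event names are invented):

    #include <cuda_runtime.h>

    // Event-based ordering between a main stream and one secondary stream (sketch only).
    void sketch_split_tensor_sync(cudaStream_t main_stream, cudaStream_t other_stream,
                                  cudaEvent_t inputs_ready, cudaEvent_t other_done) {
        cudaEventRecord(inputs_ready, main_stream);          // main device: inputs prepared
        cudaStreamWaitEvent(other_stream, inputs_ready, 0);  // secondary stream waits before starting
        // ... enqueue the secondary device's share of the work on other_stream ...
        cudaEventRecord(other_done, other_stream);           // secondary device: results ready
        cudaStreamWaitEvent(main_stream, other_done, 0);     // main stream waits before consuming them
    }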
@@ -2679,8 +3022,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0
-        ggml_cuda_op(src0, src1, dst,
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
@@ -2803,25 +3146,32 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);

         extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+        }
     }

     tensor->extra = extra;
 }

 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

     for (int id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id]
-
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaFree(extra->data_device[id]));
         }

-
-
+        if (extra->events[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+        }
     }

     delete extra;