llama_cpp 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
 
 #define QK4_0 32
 #define QR4_0 2
+#define QI4_0 4
 typedef struct {
     half d;                // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
+#define QI4_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
 typedef struct {
     half d;        // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 8
 typedef struct {
     half d;           // delta
     int8_t qs[QK8_0]; // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
 
+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 8
+typedef struct {
+    half d;           // delta
+    half s;           // unquantized sum
+    int8_t qs[QK8_0]; // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+
 //================================= k-quants
 
 #ifdef GGML_QKK_64
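
The new block_q8_1 format mirrors block_q8_0 but additionally stores s, the sum of the 32 unquantized values; the q4_1 and q5_1 dot products added later in this diff use that sum to fold in the block minimum (the m*s term). A minimal CPU sketch of how one such block would be filled, assuming the same scaling and rounding as the quantize_q8_1 kernel added below (the helper name is illustrative, not from the source):

    #include <math.h>
    #include <stdint.h>

    // Reference quantization of one 32-value block to q8_1:
    // d = amax/127, s = sum of the raw inputs, each quant = round(x/d).
    static void quantize_block_q8_1_ref(const float x[32], float * d, float * s, int8_t qs[32]) {
        float amax = 0.0f;
        float sum  = 0.0f;
        for (int j = 0; j < 32; ++j) { // QK8_1 == 32
            amax = fmaxf(amax, fabsf(x[j]));
            sum += x[j];
        }
        const float delta = amax / 127.0f;
        for (int j = 0; j < 32; ++j) {
            qs[j] = amax == 0.0f ? 0 : (int8_t) roundf(x[j] / delta);
        }
        *d = delta; // stored as half in the real block
        *s = sum;   // stored as half in the real block
    }
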
@@ -198,14 +216,15 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef GGML_CUDA_DMMV_Y
-#define GGML_CUDA_DMMV_Y 1
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
 #endif
 
 #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +233,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -265,7 +289,6 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -709,7 +732,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -814,7 +836,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -918,7 +939,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1023,7 +1043,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1134,7 +1153,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1153,6 +1171,41 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
+static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i / QK8_0; // block index
+    const int iqs = i % QK8_0; // quant index
+
+    const float xi = x[i];
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    y[ib].d = d;
+    y[ib].s = sum;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
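
quantize_q8_1 assigns one thread per input value, so the 32 threads of a warp cover exactly one q8_1 block, and the butterfly reduction with __shfl_xor_sync leaves every lane holding the block-wide amax and sum without touching shared memory. The same warp-local pattern is what the __syncthreads() calls removed above were guarding unnecessarily. The reduction in isolation, as a sketch:

    // Warp-level butterfly reduction over 32 lanes; after the loop every lane
    // holds the reduced max and sum of the values the warp started with.
    static __device__ void warp_reduce_max_sum(float & amax, float & sum) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
            sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
        }
    }
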
@@ -1174,6 +1227,182 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+    // subtract 8 from each quantized value
+    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+    const float m = bq4_1->m;
+    const float s = bq8_1->s;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int qs;
+    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 <<  4) & 0x00000010; // 1 ->  5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 <<  4) & 0x00000010; // 1 ->  5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 <<  4) & 0x00000010; // 1 ->  5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 <<  4) & 0x00000010; // 1 ->  5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t * x = (const block_q_t *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+        const int iby = i + threadIdx.x / qi; // y block index
+
+        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
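
Each vec_dot_*_q8_1 call above consumes one 32-bit word of weight quants: __vsub4 applies the per-byte offset (8 for q4_0, 16 for q5_0) and __dp4a accumulates four signed 8-bit products at a time. A scalar sketch of what a single vec_dot_q4_0_q8_1 call computes, under the assumption that q8lo points at the q8_1 bytes matching the low nibbles and q8hi at the bytes 16 positions further into the block (names are illustrative):

    #include <stdint.h>

    // Scalar equivalent of one vec_dot_q4_0_q8_1 call: 4 bytes of q4_0 quants
    // give 4 low nibbles (values i..i+3) and 4 high nibbles (values i+16..i+19).
    static float vec_dot_q4_0_q8_1_ref(const uint8_t q4[4], const int8_t q8lo[4],
                                       const int8_t q8hi[4], float d4, float d8) {
        int sumi = 0;
        for (int j = 0; j < 4; ++j) {
            const int lo = (q4[j] & 0x0F) - 8; // __vsub4(vi & 0x0F0F0F0F, 0x08080808)
            const int hi = (q4[j] >>   4) - 8; // __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808)
            sumi += lo * q8lo[j];              // __dp4a(vi0, ui0, sumi)
            sumi += hi * q8hi[j];              // __dp4a(vi1, ui1, sumi)
        }
        return sumi * d4 * d8; // both block scales are applied once at the end
    }
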
@@ -1228,7 +1457,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1279,7 +1507,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
     const int idst = channel*nrows_dst + row_dst;
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1325,7 +1552,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1435,7 +1661,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1489,6 +1714,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1557,45 +1787,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1642,6 +1872,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
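
These launchers are driven from ggml_cuda_op_mul_mat_vec further down in this diff: the f32 activation row is first quantized to q8_1 on the device and the fused integer kernel then reads both operands as packed 32-bit words. Condensed into one hypothetical helper (the called functions are the ones in this file; the wrapper itself is only for illustration):

    // Fast path for one q4_0 weight slice times one activation row of ne00 floats.
    static void mul_mat_vec_q4_0_row(const char * src0_ddq_i, const float * src1_ddf_i,
                                     float * dst_ddf_i, int ne00, int nrows, cudaStream_t stream) {
        size_t as;
        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);

        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, stream);                       // f32 -> q8_1
        mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, stream); // fused int8 dot products

        ggml_cuda_pool_free(src1_q8_1, as);
    }
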
@@ -1649,9 +1924,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1817,6 +2092,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1834,9 +2110,12 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, " Device %d: %s\n", id, prop.name);
+        fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
         g_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;
+
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
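
g_compute_capabilities stores each device's capability encoded as 100*major + 10*minor, so the later `>= 700` test selects Volta or newer for the integer-intrinsic path. A standalone sketch of the same query using only the CUDA runtime API:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Print each visible device with its capability encoded the way
    // ggml_init_cublas stores it (e.g. 7.0 -> 700, 6.1 -> 610).
    int main() {
        int count = 0;
        cudaGetDeviceCount(&count);
        for (int id = 0; id < count; ++id) {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, id);
            printf("Device %d: %s, compute capability %d\n",
                   id, prop.name, 100*prop.major + 10*prop.minor);
        }
        return 0;
    }
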
@@ -1970,7 +2249,6 @@ inline void ggml_cuda_op_add(
     } else {
         GGML_ASSERT(false);
     }
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2002,7 +2280,6 @@ inline void ggml_cuda_op_mul(
 
         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-        CUDA_CHECK(cudaGetLastError());
     }
 
     (void) dst;
@@ -2023,7 +2300,6 @@ inline void ggml_cuda_op_silu(
 
     // compute
     silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2046,7 +2322,6 @@ inline void ggml_cuda_op_rms_norm(
 
     // compute
     rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2056,7 +2331,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
+inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
@@ -2068,70 +2343,116 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;
 
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_DMMV_F16
-    size_t ash;
-    dfloat * src1_dfloat = nullptr; // dfloat == half
+#ifdef GGML_CUDA_FORCE_DMMV
+    const bool use_mul_mat_vec_q = false;
+#else
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
 
-    bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+        src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 ||
+        src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0;
 
-    if (src1_convert_f16) {
-        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
-        ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
-                              ne00, 1, sizeof(float), 0, 0,
-                              ne00, 1, sizeof(half), 0, 0, cudaStream_main);
-    }
+    // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
+    // However, they have bad performance with Pascal cards.
+    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+#endif
+
+    if (use_mul_mat_vec_q) {
+        size_t as;
+        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+
+        ggml_cuda_pool_free(src1_q8_1, as);
+    } else {
+        // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+        size_t ash;
+        dfloat * src1_dfloat = nullptr; // dfloat == half
+
+        bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+            src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+            src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+        if (src1_convert_f16) {
+            src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+            ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                                  ne00, 1, sizeof(float), 0, 0,
+                                  ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+        }
 #else
-    dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
+        dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16
 
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-    CUDA_CHECK(cudaGetLastError());
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q2_K:
+                dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_F16:
+                convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
 #ifdef GGML_CUDA_DMMV_F16
-    if (src1_convert_f16) {
-        ggml_cuda_pool_free(src1_dfloat, ash);
-    }
+        if (src1_convert_f16) {
+            ggml_cuda_pool_free(src1_dfloat, ash);
+        }
 #endif // GGML_CUDA_DMMV_F16
+    }
 
     (void) src1;
     (void) dst;
@@ -2202,7 +2523,6 @@ inline void ggml_cuda_op_rope(
 
     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2226,7 +2546,6 @@ inline void ggml_cuda_op_diag_mask_inf(
 
     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2248,7 +2567,6 @@ inline void ggml_cuda_op_soft_max(
 
     // compute
     soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2344,10 +2662,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
-    // if multiple GPUs are used they need to wait for the main GPU to finish
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
     }
 
     for (int id = 0; id < g_device_count; ++id) {
@@ -2373,6 +2692,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_diff = row_high - row_low;
 
         cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }
 
         if (src0_on_device && src0_is_contiguous) {
             if (src0_is_f32) {
@@ -2448,8 +2773,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
                 const int64_t i11 = i13*ne12 + i12;
 
-                cudaStream_t cudaStream_main = g_cudaStreams_main[id];
-
                 // for split tensors the data begins at i0 == i0_offset_low
                 char  * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
                 float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2509,6 +2832,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
                 // do the computation
                 op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                CUDA_CHECK(cudaGetLastError());
 
                 // copy dst to host or other device if necessary
                 if (!dst_on_device) {
@@ -2538,6 +2862,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
                 }
             }
+
+            // signify to main device that other device is done
+            if (split && g_device_count > 1 && id != g_main_device) {
+                CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+            }
         }
     }
 }
@@ -2549,7 +2878,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         }
 
         CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaDeviceSynchronize());
 
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2564,6 +2892,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
         }
     }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
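
The cudaDeviceSynchronize() calls removed in this and the earlier hunks are replaced by event ordering: the main device records an event once the input is ready, every other device makes its stream wait on that event, and each secondary device records its own completion event that the main stream waits on before the result is consumed. A minimal two-stream sketch of that pattern, mirroring the calls used here (the stream handles are assumed to exist; the host only blocks in the final CPU-output case above):

    // Event-based ordering between a main stream and one worker stream.
    static void sync_via_events(cudaStream_t main_stream, cudaStream_t worker_stream) {
        cudaEvent_t input_ready, worker_done;
        CUDA_CHECK(cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming));
        CUDA_CHECK(cudaEventCreateWithFlags(&worker_done, cudaEventDisableTiming));

        CUDA_CHECK(cudaEventRecord(input_ready, main_stream));          // main device finished the input
        CUDA_CHECK(cudaStreamWaitEvent(worker_stream, input_ready, 0)); // worker stream queues a wait

        // ... kernels for the worker device would be launched on worker_stream here ...

        CUDA_CHECK(cudaEventRecord(worker_done, worker_stream));        // worker signals completion
        CUDA_CHECK(cudaStreamWaitEvent(main_stream, worker_done, 0));   // main stream waits before consuming results

        CUDA_CHECK(cudaEventDestroy(input_ready));
        CUDA_CHECK(cudaEventDestroy(worker_done));
    }
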
@@ -2679,8 +3022,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
@@ -2803,25 +3146,32 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
 
         extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+        }
     }
 
     tensor->extra = extra;
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
     for (int id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id] == nullptr) {
-            continue;
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
-        CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaFree(extra->data_device[id]));
+        if (extra->events[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+        }
     }
 
     delete extra;