llama_cpp 0.3.0 → 0.3.2

@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
 
 #define QK4_0 32
 #define QR4_0 2
+#define QI4_0 4
 typedef struct {
     half d; // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
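Editorial note, not part of the diff: QI is the number of 32-bit integers that one block's quant data occupies, which is the unit the new integer dot-product kernels further down iterate over. Since QK/QR is the number of quantized bytes per block, QI follows directly; a compile-time sanity check (illustrative sketch only, assuming a 4-byte int as on CUDA targets, and using the Q8_0/Q8_1 constants introduced below) could look like:

    // Illustrative only: QI == (bytes of quant data per block) / sizeof(int)
    static_assert(QI4_0 == (QK4_0 / QR4_0) / sizeof(int), "Q4_0: 32 nibbles = 16 bytes = 4 ints");
    static_assert(QI8_0 == (QK8_0 / QR8_0) / sizeof(int), "Q8_0: 32 bytes = 8 ints");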
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
+#define QI4_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
 typedef struct {
     half d; // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 8
 typedef struct {
     half d; // delta
     int8_t qs[QK8_0]; // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
 
+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 8
+typedef struct {
+    half d; // delta
+    half s; // unquantized sum
+    int8_t qs[QK8_0]; // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+
 //================================= k-quants
 
 #ifdef GGML_QKK_64
@@ -198,14 +216,15 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef GGML_CUDA_DMMV_Y
-#define GGML_CUDA_DMMV_Y 1
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
 #endif
 
 #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +233,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -223,6 +247,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }
 
+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __hadd(x[i], __float2half(y[i]));
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -256,7 +289,6 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -700,7 +732,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -805,7 +836,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -909,7 +939,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1014,7 +1043,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1125,7 +1153,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1144,6 +1171,41 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
+static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i / QK8_0; // block index
+    const int iqs = i % QK8_0; // quant index
+
+    const float xi = x[i];
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    y[ib].d = d;
+    y[ib].s = sum;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
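The quantize_q8_1 kernel above assigns one thread per value and lets a warp cooperate on one 32-value block: the two __shfl_xor_sync reductions give every lane the block's absolute maximum and its unquantized sum, the scale is d = amax/127, and only lane 0 of each block writes d and s. A host-side reference of the same arithmetic, as a hedged sketch (the helper name and the plain-C formulation are mine, not part of the patch):

    #include <math.h>
    #include <stdint.h>

    // Reference quantization of one 32-value block, mirroring quantize_q8_1 without the warp shuffle.
    static void quantize_block_q8_1_ref(const float * x, int8_t * qs, float * d, float * s) {
        float amax = 0.0f, sum = 0.0f;
        for (int j = 0; j < 32; ++j) {
            amax = fmaxf(amax, fabsf(x[j])); // largest magnitude fixes the scale
            sum += x[j];                     // unquantized sum, stored as block_q8_1::s
        }
        *d = amax / 127;
        for (int j = 0; j < 32; ++j) {
            qs[j] = amax == 0.0f ? 0 : (int8_t) roundf(x[j] / *d);
        }
        *s = sum;
    }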
@@ -1165,6 +1227,182 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+    // subtract 8 from each quantized value
+    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+    const float m = bq4_1->m;
+    const float s = bq8_1->s;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int qs;
+    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t * x = (const block_q_t *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+        const int iby = i + threadIdx.x / qi; // y block index
+
+        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
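A note on the intrinsics used in the vec_dot_*_q8_1 functions above (editorial, not part of the diff): __dp4a(a, b, c) treats its two integer operands as four signed 8-bit lanes and returns c plus the four lane-wise products, and __vsub4 subtracts per byte, which is how the (q - 8) and (q - 16) offsets of Q4_0/Q5_0 are applied to four quants at once. A scalar sketch of the dot-product intrinsic, under the assumption of signed 8-bit lanes:

    #include <stdint.h>

    // Hypothetical scalar equivalent of __dp4a for signed 8-bit lanes (illustrative only).
    static int dp4a_ref(int a, int b, int c) {
        for (int i = 0; i < 4; ++i) {
            const int8_t ai = (int8_t)(a >> 8*i); // extract lane i of a
            const int8_t bi = (int8_t)(b >> 8*i); // extract lane i of b
            c += (int)ai * (int)bi;
        }
        return c;
    }

The m*s / QI4_1 (and m*s / QI5_1) term compensates for the fact that QI4_1 threads of a warp each add the block's minimum times the q8_1 sum, so the constant contribution is split evenly across them, as the inline comments state.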
@@ -1219,7 +1457,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1235,7 +1472,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
 }
 
 static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1270,7 +1507,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
     const int idst = channel*nrows_dst + row_dst;
 
     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1283,9 +1519,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
+    const int row_stride_x, const int channel_stride_x) {
 
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1316,7 +1552,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1328,14 +1563,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 }
 
 static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     float * dsti = (float *) cdsti;
 
     *dsti = *xi;
 }
 
 static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     half * dsti = (half *) cdsti;
 
     *dsti = __float2half(*xi);
@@ -1426,7 +1661,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1459,6 +1693,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1475,6 +1714,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1543,45 +1787,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
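The launchers above all share the same geometry: one warp per row in x, GGML_CUDA_MMV_Y rows per block in y, and block_num_y rounded up so the last partial group of rows still gets a block. A small sketch of that ceiling division (hypothetical numbers, not part of the patch):

    // Illustrative only: grid sizing used by the mul-mat-vec launchers.
    static int ceil_div(int a, int b) {
        return (a + b - 1) / b; // e.g. 4097 rows with 2 rows per block -> 2049 blocks
    }
    // dim3 block_nums(1, ceil_div(nrows, GGML_CUDA_MMV_Y), 1);
    // dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);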
@@ -1628,6 +1872,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1635,9 +1924,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1684,7 +1973,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const dim3 block_nums(1, nrows_x, nchannels_x);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
@@ -1803,6 +2092,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1820,9 +2110,12 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, " Device %d: %s\n", id, prop.name);
+        fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
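The packed value stored in g_compute_capabilities above encodes major and minor as 100*major + 10*minor, so compute capability 6.1 becomes 610 and 7.0 becomes 700; the mul-mat-vec dispatch further down compares against 700. A tiny decoding helper, purely illustrative and not part of the patch:

    // Hypothetical helper: unpack 100*major + 10*minor back into its parts.
    static void decode_compute_capability(int cc, int * major, int * minor) {
        *major = cc / 100;        // 610 -> 6
        *minor = (cc % 100) / 10; // 610 -> 1
    }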
@@ -1941,7 +2234,7 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
 
-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
@@ -1949,8 +2242,13 @@ inline void ggml_cuda_op_add(
     const int64_t i01_diff = i01_high - i01_low;
 
     // compute
-    add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
 
     (void) src1;
     (void) dst;
@@ -1982,7 +2280,6 @@ inline void ggml_cuda_op_mul(
 
         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-        CUDA_CHECK(cudaGetLastError());
     }
 
     (void) dst;
@@ -2003,7 +2300,6 @@ inline void ggml_cuda_op_silu(
 
     // compute
     silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2026,7 +2322,6 @@ inline void ggml_cuda_op_rms_norm(
 
     // compute
     rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2036,7 +2331,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
+inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
@@ -2048,70 +2343,116 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;
 
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_DMMV_F16
-    size_t ash;
-    dfloat * src1_dfloat = nullptr; // dfloat == half
+#ifdef GGML_CUDA_FORCE_DMMV
+    const bool use_mul_mat_vec_q = false;
+#else
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
 
-    bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+        src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 ||
+        src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0;
 
-    if (src1_convert_f16) {
-        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
-        ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
-            ne00, 1, sizeof(float), 0, 0,
-            ne00, 1, sizeof(half), 0, 0, cudaStream_main);
-    }
+    // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
+    // However, they have bad performance with Pascal cards.
+    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+#endif
+
+    if (use_mul_mat_vec_q) {
+        size_t as;
+        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+
+        ggml_cuda_pool_free(src1_q8_1, as);
+    } else {
+        // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+        size_t ash;
+        dfloat * src1_dfloat = nullptr; // dfloat == half
+
+        bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+            src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+            src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+        if (src1_convert_f16) {
+            src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+            ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                ne00, 1, sizeof(float), 0, 0,
+                ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+        }
 #else
-    dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
+        dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16
 
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-    CUDA_CHECK(cudaGetLastError());
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q2_K:
+                dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_F16:
+                convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
 #ifdef GGML_CUDA_DMMV_F16
-    if (src1_convert_f16) {
-        ggml_cuda_pool_free(src1_dfloat, ash);
-    }
+        if (src1_convert_f16) {
+            ggml_cuda_pool_free(src1_dfloat, ash);
+        }
 #endif // GGML_CUDA_DMMV_F16
+    }
 
     (void) src1;
     (void) dst;
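A quick check of the scratch-buffer arithmetic used above for src1: ne00 floats quantize into ne00/QK8_1 blocks of block_q8_1, and each block holds two halves plus 32 int8 quants, i.e. 36 bytes (matching the static_assert earlier in the diff). The sketch below restates that size computation on the host; the helper name is mine and the 16-bit stand-in for half is an assumption, not part of the patch:

    #include <stddef.h>
    #include <stdint.h>

    // Illustrative only: bytes needed to hold one row of ne00 floats in q8_1 form.
    static size_t q8_1_row_size(size_t ne00) {
        const size_t block_bytes = 2*sizeof(uint16_t) + 32; // sizeof(block_q8_1) == 36
        return ne00/32 * block_bytes;                       // e.g. ne00 = 4096 -> 4608 bytes
    }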
@@ -2182,7 +2523,6 @@ inline void ggml_cuda_op_rope(
 
     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2206,7 +2546,6 @@ inline void ggml_cuda_op_diag_mask_inf(
 
     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2228,7 +2567,6 @@ inline void ggml_cuda_op_soft_max(
 
     // compute
     soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
@@ -2324,10 +2662,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
-    // if multiple GPUs are used they need to wait for the main GPU to finish
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(cudaDeviceSynchronize());
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
     }
 
     for (int id = 0; id < g_device_count; ++id) {
@@ -2353,6 +2692,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_diff = row_high - row_low;
 
         cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }
 
         if (src0_on_device && src0_is_contiguous) {
             if (src0_is_f32) {
@@ -2428,8 +2773,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
                 const int64_t i11 = i13*ne12 + i12;
 
-                cudaStream_t cudaStream_main = g_cudaStreams_main[id];
-
                 // for split tensors the data begins at i0 == i0_offset_low
                 char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
                 float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2489,6 +2832,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
                 // do the computation
                 op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                CUDA_CHECK(cudaGetLastError());
 
                 // copy dst to host or other device if necessary
                 if (!dst_on_device) {
@@ -2518,6 +2862,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
                 }
             }
+
+            // signify to main device that other device is done
+            if (split && g_device_count > 1 && id != g_main_device) {
+                CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+            }
         }
     }
 }
@@ -2529,7 +2878,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         }
 
         CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaDeviceSynchronize());
 
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2544,11 +2892,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
            ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
        }
    }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+    // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+    // Due to flatten_rows == true this does in practice not make a difference however.
+    // Better solution would be nice but right now that would require disproportionate changes.
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
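The per-device events added above replace full cudaDeviceSynchronize() calls with stream-level ordering: the main device records an event once the input is ready, every other device makes its stream wait on that event, records its own event when its share is done, and the main device's stream then waits on those events; a blocking synchronize only remains when the destination lives on the CPU. A minimal sketch of that pattern for two devices (the variable names and the two-GPU setup are assumptions, error checking omitted; not part of the patch):

    // ev0 was created on device 0, ev1 on device 1, each with cudaEventDisableTiming.
    void split_op_sync(cudaStream_t stream0, cudaStream_t stream1, cudaEvent_t ev0, cudaEvent_t ev1) {
        cudaSetDevice(0);
        // ... enqueue input preparation on stream0 ...
        cudaEventRecord(ev0, stream0);        // main device: input data is ready

        cudaSetDevice(1);
        cudaStreamWaitEvent(stream1, ev0, 0); // secondary device waits for the input
        // ... enqueue this device's share of the work on stream1 ...
        cudaEventRecord(ev1, stream1);        // signal completion

        cudaSetDevice(0);
        cudaStreamWaitEvent(stream0, ev1, 0); // main device resumes only after device 1 is done
    }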
@@ -2653,8 +3022,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
@@ -2777,31 +3146,38 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
 
         extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+        }
     }
 
     tensor->extra = extra;
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
     for (int id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id] == nullptr) {
-            continue;
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
-        CUDA_CHECK(cudaSetDevice(id));
-        CUDA_CHECK(cudaFree(extra->data_device[id]));
+        if (extra->events[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+        }
     }
 
     delete extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -2810,11 +3186,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src0->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
        }
    }
     if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
@@ -2822,11 +3198,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     memset(extra, 0, sizeof(*extra));
 
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
@@ -2865,11 +3242,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true);
+    ggml_cuda_assign_buffers_impl(tensor, true, false);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true);
 }
 
 void ggml_cuda_set_main_device(int main_device) {