llama_cpp 0.3.0 → 0.3.2

This diff shows the published contents of the two package versions exactly as they appear in their public registry; it is provided for informational purposes only.
@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(
 
  // QK = number of values after dequantization
  // QR = QK / number of values before dequantization
+ // QI = number of 32 bit integers before dequantization
 
  #define QK4_0 32
  #define QR4_0 2
+ #define QI4_0 4
  typedef struct {
  half d; // delta
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
  #define QK4_1 32
  #define QR4_1 2
+ #define QI4_1 4
  typedef struct {
  half d; // delta
  half m; // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
  #define QK5_0 32
  #define QR5_0 2
+ #define QI5_0 4
  typedef struct {
  half d; // delta
  uint8_t qh[4]; // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
  #define QK5_1 32
  #define QR5_1 2
+ #define QI5_1 4
  typedef struct {
  half d; // delta
  half m; // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
  #define QK8_0 32
  #define QR8_0 1
+ #define QI8_0 8
  typedef struct {
  half d; // delta
  int8_t qs[QK8_0]; // quants
  } block_q8_0;
  static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
 
+ #define QK8_1 32
+ #define QR8_1 1
+ #define QI8_1 8
+ typedef struct {
+ half d; // delta
+ half s; // unquantized sum
+ int8_t qs[QK8_0]; // quants
+ } block_q8_1;
+ static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+ typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+
  //================================= k-quants
 
  #ifdef GGML_QKK_64
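The new QI_* constants complement QK (values per block) and QR (dequantization ratio): they count the 32-bit integers that one block's quant data occupies, which is how the integer dot-product kernels added later in this diff index into a block. block_q8_1 is a q8_0 block plus a pre-computed sum s of the 32 unquantized values; the *_1 dot products use it to fold the per-block minimum m into the result without touching individual quants. A minimal host-side sketch (not part of the diff; the defines are restated locally so it compiles on its own):

    // QI is the number of 32-bit integers holding one block's quants:
    // QI == QK / (QR * sizeof(int))
    #define QK4_0 32
    #define QR4_0 2
    #define QI4_0 4
    #define QK8_1 32
    #define QR8_1 1
    #define QI8_1 8

    static_assert(QI4_0 == QK4_0 / (QR4_0 * (int) sizeof(int)), "q4_0: 16 bytes of nibbles = 4 ints");
    static_assert(QI8_1 == QK8_1 / (QR8_1 * (int) sizeof(int)), "q8_1: 32 int8 quants = 8 ints");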
@@ -198,14 +216,15 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+ #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
  // dmmv = dequantize_mul_mat_vec
  #ifndef GGML_CUDA_DMMV_X
  #define GGML_CUDA_DMMV_X 32
  #endif
- #ifndef GGML_CUDA_DMMV_Y
- #define GGML_CUDA_DMMV_Y 1
+ #ifndef GGML_CUDA_MMV_Y
+ #define GGML_CUDA_MMV_Y 1
  #endif
 
  #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +233,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
  #endif
 
+ struct ggml_tensor_extra_gpu {
+ void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+ };
+
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -223,6 +247,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
  dst[i] = x[i] + y[i];
  }
 
+ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = __hadd(x[i], __float2half(y[i]));
+ }
+
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -256,7 +289,6 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
  }
 
  // sum up partial sums
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -700,7 +732,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
  #endif
 
  // sum up partial sums and write back result
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -805,7 +836,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
  #endif
 
  // sum up partial sums and write back result
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -909,7 +939,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
  #endif
 
  // sum up partial sums and write back result
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1014,7 +1043,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
  #endif
 
  // sum up partial sums and write back result
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1125,7 +1153,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
  #endif
 
  // sum up partial sums and write back result
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
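The removed __syncthreads() calls all sat directly in front of a warp-shuffle reduction. At that point each partial sum lives in a register of a single warp (the block's x dimension is WARP_SIZE), and __shfl_xor_sync already synchronizes the participating lanes, so the block-wide barrier does no useful work here. A self-contained sketch of the reduction these kernels perform (helper name is ours, not from the file):

    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            // butterfly exchange: after 5 steps every lane of the warp holds the full sum
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);
        }
        return x;
    }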
@@ -1144,6 +1171,41 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
  v.y = x[ib + iqs + 1];
  }
 
+ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+
+ block_q8_1 * y = (block_q8_1 *) vy;
+
+ const int ib = i / QK8_0; // block index
+ const int iqs = i % QK8_0; // quant index
+
+ const float xi = x[i];
+ float amax = fabsf(xi);
+ float sum = xi;
+
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+ sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+ }
+
+ const float d = amax / 127;
+ const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+ y[ib].qs[iqs] = q;
+
+ if (iqs > 0) {
+ return;
+ }
+
+ y[ib].d = d;
+ y[ib].s = sum;
+ }
+
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
  static __global__ void dequantize_block(const void * vx, float * y, const int k) {
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
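quantize_q8_1 maps one warp to one 32-value block: every lane loads one float, the warp-wide shuffles compute the absolute maximum and the sum, each lane writes its own quant, and lane 0 stores the block scale d and the sum s. A host-side reference of the same arithmetic (a sketch; struct and function names are ours, and the real block stores d and s as half):

    #include <cmath>
    #include <cstdint>

    struct q8_1_ref {
        float  d;       // delta = amax / 127
        float  s;       // sum of the unquantized inputs
        int8_t qs[32];  // quants
    };

    static q8_1_ref quantize_q8_1_ref(const float * x) {
        q8_1_ref y{};
        float amax = 0.0f;
        for (int i = 0; i < 32; ++i) {
            amax = std::fmax(amax, std::fabs(x[i]));
            y.s += x[i];
        }
        y.d = amax / 127.0f;
        for (int i = 0; i < 32; ++i) {
            y.qs[i] = amax == 0.0f ? 0 : (int8_t) std::round(x[i] / y.d);
        }
        return y;
    }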
@@ -1165,6 +1227,182 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
  y[iybs + iqs + y_offset] = v.y;
  }
 
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+ int vi;
+ memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+ const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+ // subtract 8 from each quantized value
+ const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+ const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+ // SIMD dot product of quantized values
+ int sumi = __dp4a(vi0, ui0, 0);
+ sumi = __dp4a(vi1, ui1, sumi);
+
+ return sumi*d;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= 600
+ }
+
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+ const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+ const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+ const float m = bq4_1->m;
+ const float s = bq8_1->s;
+
+ const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+ const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+ // SIMD dot product of quantized values
+ int sumi = __dp4a(vi0, ui0, 0);
+ sumi = __dp4a(vi1, ui1, sumi);
+
+ return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= 600
+ }
+
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+ int qs;
+ memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+ const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+ const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+ const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+ int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+ vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+ vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+ vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+ vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+ vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+ int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+ int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+ vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+ vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+ vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+ vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+ vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+ sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+ return sumi*d;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= 600
+ }
+
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+ const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+ const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+ const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+ const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+ const float m = bq5_1->m;
+ const float s = bq8_1->s;
+
+ int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+ vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+ vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+ vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+ vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+ int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+ int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+ vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+ vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+ vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+ vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+ sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+ return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= 600
+ }
+
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+ int vi;
+ memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+ const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+ const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+ // SIMD dot product of quantized values
+ int sumi = __dp4a(vi, ui, 0);
+
+ return sumi*d;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= 600
+ }
+
+ template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+ if (row >= nrows) {
+ return;
+ }
+
+ const int blocks_per_row = ncols / qk;
+ const int blocks_per_warp = WARP_SIZE / qi;
+
+ // partial sum for each thread
+ float tmp = 0.0f;
+
+ const block_q_t * x = (const block_q_t *) vx;
+ const block_q8_1 * y = (const block_q8_1 *) vy;
+
+ for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+ const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+ const int iby = i + threadIdx.x / qi; // y block index
+
+ const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+ tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+ }
+
+ // sum up partial sums and write back result
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ if (threadIdx.x == 0) {
+ dst[row] = tmp;
+ }
+ }
+
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
  static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
  // qk = quantized weights per x block
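mul_mat_vec_q assigns one warp per output row; within a weight block, QI consecutive lanes each load one 32-bit word of quants, call the per-format vec_dot, and the warp reduction at the end combines the partial sums. The vec_dot functions use __vsub4/__dp4a to do four 8-bit multiply-adds per instruction. A host-side sketch of what vec_dot_q4_0_q8_1 computes for a whole q4_0 block (names and the full-block scope are ours; on the GPU the same sum is split across QI4_0 lanes):

    #include <cstdint>

    // qs/d describe one q4_0 block (16 bytes of nibbles, scale d);
    // q8/d8 describe the matching q8_1 block of activations.
    static float dot_q4_0_q8_1_ref(const uint8_t qs[16], float d, const int8_t q8[32], float d8) {
        int sumi = 0;
        for (int j = 0; j < 16; ++j) {
            const int w0 = (qs[j] & 0x0F) - 8; // low nibble pairs with q8[j]
            const int w1 = (qs[j] >>   4) - 8; // high nibble pairs with q8[j + 16]
            sumi += w0 * q8[j] + w1 * q8[j + 16];
        }
        return sumi * d * d8;
    }

For q4_1/q5_1 the per-block minimum is handled separately: each of the QI lanes adds m*s/QI, so the block as a whole contributes m*s exactly once (the comments in the kernels above say the same).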
@@ -1219,7 +1457,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
  }
 
  // sum up partial sums and write back result
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1235,7 +1472,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
  }
 
  static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
- const half * x = (half *) vx;
+ const half * x = (const half *) vx;
 
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1270,7 +1507,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
  const int idst = channel*nrows_dst + row_dst;
 
  // sum up partial sums and write back result
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1283,9 +1519,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
- const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
+ const int row_stride_x, const int channel_stride_x) {
 
- const half * x = (half *) vx;
+ const half * x = (const half *) vx;
 
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1316,7 +1552,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  }
 
  // sum up partial sums and write back result
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1328,14 +1563,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  }
 
  static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
- const float * xi = (float *) cxi;
+ const float * xi = (const float *) cxi;
  float * dsti = (float *) cdsti;
 
  *dsti = *xi;
  }
 
  static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
- const float * xi = (float *) cxi;
+ const float * xi = (const float *) cxi;
  half * dsti = (half *) cdsti;
 
  *dsti = __float2half(*xi);
@@ -1426,7 +1661,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
  }
 
  // sum up partial sums
- __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1459,6 +1693,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
  }
 
+ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+ }
+
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1475,6 +1714,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
  }
 
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+ quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+ }
+
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
  dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
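These launchers all use the same ceil-division to size the grid, and the q8_1 path added further down allocates its scratch buffer from the memory pool as ne00*sizeof(block_q8_1)/QK8_1 bytes, i.e. one 36-byte block per 32 source floats. A small sketch of both calculations (helper names are ours; the byte count matches the pool allocation when the row length is a multiple of 32):

    #include <cstddef>

    static int ceil_div(int k, int block_size) {
        // e.g. ceil_div(1000, 256) == 4, so 4*256 threads cover 1000 elements
        return (k + block_size - 1) / block_size;
    }

    static size_t q8_1_row_bytes(size_t ne00) {
        const size_t qk  = 32;       // QK8_1: floats per block
        const size_t bsz = 2*2 + 32; // sizeof(block_q8_1): two halves plus 32 int8 quants = 36 bytes
        return ne00 / qk * bsz;
    }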
@@ -1543,45 +1787,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
1543
1787
 
1544
1788
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1545
1789
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1546
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1790
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1547
1791
  const dim3 block_nums(1, block_num_y, 1);
1548
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1792
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1549
1793
  dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
1550
1794
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1551
1795
  }
1552
1796
 
1553
1797
  static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1554
1798
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1555
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1799
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1556
1800
  const dim3 block_nums(1, block_num_y, 1);
1557
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1801
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1558
1802
  dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
1559
1803
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1560
1804
  }
1561
1805
 
1562
1806
  static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1563
1807
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1564
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1808
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1565
1809
  const dim3 block_nums(1, block_num_y, 1);
1566
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1810
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1567
1811
  dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
1568
1812
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1569
1813
  }
1570
1814
 
1571
1815
  static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1572
1816
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1573
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1817
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1574
1818
  const dim3 block_nums(1, block_num_y, 1);
1575
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1819
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1576
1820
  dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
1577
1821
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1578
1822
  }
1579
1823
 
1580
1824
  static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1581
1825
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1582
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1826
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1583
1827
  const dim3 block_nums(1, block_num_y, 1);
1584
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1828
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1585
1829
  dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
1586
1830
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1587
1831
  }
@@ -1628,6 +1872,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
1628
1872
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1629
1873
  }
1630
1874
 
1875
+ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1876
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1877
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1878
+ const dim3 block_nums(1, block_num_y, 1);
1879
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1880
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
1881
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1882
+ }
1883
+
1884
+ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1885
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1886
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1887
+ const dim3 block_nums(1, block_num_y, 1);
1888
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1889
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
1890
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1891
+ }
1892
+
1893
+ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1894
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1895
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1896
+ const dim3 block_nums(1, block_num_y, 1);
1897
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1898
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
1899
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1900
+ }
1901
+
1902
+ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1903
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1904
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1905
+ const dim3 block_nums(1, block_num_y, 1);
1906
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1907
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
1908
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1909
+ }
1910
+
1911
+ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1912
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1913
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1914
+ const dim3 block_nums(1, block_num_y, 1);
1915
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1916
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
1917
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1918
+ }
1919
+
1631
1920
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1632
1921
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1633
1922
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1635,9 +1924,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
1635
1924
 
1636
1925
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1637
1926
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1638
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1927
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1639
1928
  const dim3 block_nums(1, block_num_y, 1);
1640
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1929
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1641
1930
  dequantize_mul_mat_vec<1, 1, convert_f16>
1642
1931
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1643
1932
  }
@@ -1684,7 +1973,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
1684
1973
  const dim3 block_nums(1, nrows_x, nchannels_x);
1685
1974
  const dim3 block_dims(WARP_SIZE, 1, 1);
1686
1975
  mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
1687
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
1976
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
1688
1977
  }
1689
1978
 
1690
1979
  static void ggml_cpy_f32_f32_cuda(
@@ -1803,6 +2092,7 @@ static size_t g_scratch_offset = 0;
 
  static int g_device_count = -1;
  static int g_main_device = 0;
+ static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1820,9 +2110,12 @@ void ggml_init_cublas() {
  for (int id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %d: %s\n", id, prop.name);
+ fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+
  g_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;
+
+ g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
  }
  for (int id = 0; id < g_device_count; ++id) {
  g_tensor_split[id] /= total_vram;
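The compute capability is cached per device as 100*major + 10*minor so that a single integer comparison can gate kernel selection later in this diff (the quantized dot-product path requires at least 7.0). A tiny sketch with example values (function name is ours):

    static int pack_compute_capability(int major, int minor) {
        return 100*major + 10*minor; // 6.1 -> 610, 7.0 -> 700, 8.6 -> 860
    }
    // the mul_mat_vec_q kernels are only selected when this packed value is >= 700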
@@ -1941,7 +2234,7 @@ inline void ggml_cuda_op_add(
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
  cudaStream_t & cudaStream_main){
 
- GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
  GGML_ASSERT(src1_ddf_i != nullptr);
  GGML_ASSERT(dst_ddf_i != nullptr);
 
@@ -1949,8 +2242,13 @@ inline void ggml_cuda_op_add(
  const int64_t i01_diff = i01_high - i01_low;
 
  // compute
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
- CUDA_CHECK(cudaGetLastError());
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+ add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ } else {
+ GGML_ASSERT(false);
+ }
 
  (void) src1;
  (void) dst;
@@ -1982,7 +2280,6 @@ inline void ggml_cuda_op_mul(
1982
2280
 
1983
2281
  // compute
1984
2282
  mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
1985
- CUDA_CHECK(cudaGetLastError());
1986
2283
  }
1987
2284
 
1988
2285
  (void) dst;
@@ -2003,7 +2300,6 @@ inline void ggml_cuda_op_silu(
2003
2300
 
2004
2301
  // compute
2005
2302
  silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
2006
- CUDA_CHECK(cudaGetLastError());
2007
2303
 
2008
2304
  (void) src1;
2009
2305
  (void) dst;
@@ -2026,7 +2322,6 @@ inline void ggml_cuda_op_rms_norm(
2026
2322
 
2027
2323
  // compute
2028
2324
  rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2029
- CUDA_CHECK(cudaGetLastError());
2030
2325
 
2031
2326
  (void) src1;
2032
2327
  (void) dst;
@@ -2036,7 +2331,7 @@ inline void ggml_cuda_op_rms_norm(
  (void) i1;
  }
 
- inline void ggml_cuda_op_dequantize_mul_mat_vec(
+ inline void ggml_cuda_op_mul_mat_vec(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
  cudaStream_t & cudaStream_main){
@@ -2048,70 +2343,116 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
2048
2343
  const int64_t ne00 = src0->ne[0];
2049
2344
  const int64_t nrows = i01_high - i01_low;
2050
2345
 
2051
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
2052
- #ifdef GGML_CUDA_DMMV_F16
2053
- size_t ash;
2054
- dfloat * src1_dfloat = nullptr; // dfloat == half
2346
+ #ifdef GGML_CUDA_FORCE_DMMV
2347
+ const bool use_mul_mat_vec_q = false;
2348
+ #else
2349
+ int id;
2350
+ CUDA_CHECK(cudaGetDevice(&id));
2055
2351
 
2056
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
2057
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
2058
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
2352
+ const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
2353
+ src0->type == GGML_TYPE_Q4_1 ||
2354
+ src0->type == GGML_TYPE_Q5_0 ||
2355
+ src0->type == GGML_TYPE_Q5_1 ||
2356
+ src0->type == GGML_TYPE_Q8_0;
2059
2357
 
2060
- if (src1_convert_f16) {
2061
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
2062
- ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
2063
- ne00, 1, sizeof(float), 0, 0,
2064
- ne00, 1, sizeof(half), 0, 0, cudaStream_main);
2065
- }
2358
+ // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
2359
+ // However, they have bad performance with Pascal cards.
2360
+ // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
2361
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
2362
+ #endif
2363
+
2364
+ if (use_mul_mat_vec_q) {
2365
+ size_t as;
2366
+ void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
2367
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
2368
+
2369
+ switch (src0->type) {
2370
+ case GGML_TYPE_Q4_0:
2371
+ mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2372
+ break;
2373
+ case GGML_TYPE_Q4_1:
2374
+ mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2375
+ break;
2376
+ case GGML_TYPE_Q5_0:
2377
+ mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2378
+ break;
2379
+ case GGML_TYPE_Q5_1:
2380
+ mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2381
+ break;
2382
+ case GGML_TYPE_Q8_0:
2383
+ mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2384
+ break;
2385
+ default:
2386
+ GGML_ASSERT(false);
2387
+ break;
2388
+ }
2389
+
2390
+ ggml_cuda_pool_free(src1_q8_1, as);
2391
+ } else {
2392
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
2393
+ #ifdef GGML_CUDA_DMMV_F16
2394
+ size_t ash;
2395
+ dfloat * src1_dfloat = nullptr; // dfloat == half
2396
+
2397
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
2398
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
2399
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
2400
+
2401
+ if (src1_convert_f16) {
2402
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
2403
+ ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
2404
+ ne00, 1, sizeof(float), 0, 0,
2405
+ ne00, 1, sizeof(half), 0, 0, cudaStream_main);
2406
+ }
2066
2407
  #else
2067
- dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
2408
+ dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
2068
2409
  #endif // GGML_CUDA_DMMV_F16
2069
2410
 
2070
- switch (src0->type) {
2071
- case GGML_TYPE_Q4_0:
2072
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2073
- break;
2074
- case GGML_TYPE_Q4_1:
2075
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2076
- break;
2077
- case GGML_TYPE_Q5_0:
2078
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2079
- break;
2080
- case GGML_TYPE_Q5_1:
2081
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2082
- break;
2083
- case GGML_TYPE_Q8_0:
2084
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2085
- break;
2086
- case GGML_TYPE_Q2_K:
2087
- dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2088
- break;
2089
- case GGML_TYPE_Q3_K:
2090
- dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2091
- break;
2092
- case GGML_TYPE_Q4_K:
2093
- dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2094
- break;
2095
- case GGML_TYPE_Q5_K:
2096
- dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2097
- break;
2098
- case GGML_TYPE_Q6_K:
2099
- dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2100
- break;
2101
- case GGML_TYPE_F16:
2102
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2103
- break;
2104
- default:
2105
- GGML_ASSERT(false);
2106
- break;
2107
- }
2108
- CUDA_CHECK(cudaGetLastError());
2411
+ switch (src0->type) {
2412
+ case GGML_TYPE_Q4_0:
2413
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2414
+ break;
2415
+ case GGML_TYPE_Q4_1:
2416
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2417
+ break;
2418
+ case GGML_TYPE_Q5_0:
2419
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2420
+ break;
2421
+ case GGML_TYPE_Q5_1:
2422
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2423
+ break;
2424
+ case GGML_TYPE_Q8_0:
2425
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2426
+ break;
2427
+ case GGML_TYPE_Q2_K:
2428
+ dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2429
+ break;
2430
+ case GGML_TYPE_Q3_K:
2431
+ dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2432
+ break;
2433
+ case GGML_TYPE_Q4_K:
2434
+ dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2435
+ break;
2436
+ case GGML_TYPE_Q5_K:
2437
+ dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2438
+ break;
2439
+ case GGML_TYPE_Q6_K:
2440
+ dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2441
+ break;
2442
+ case GGML_TYPE_F16:
2443
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2444
+ break;
2445
+ default:
2446
+ GGML_ASSERT(false);
2447
+ break;
2448
+ }
2109
2449
 
2110
2450
  #ifdef GGML_CUDA_DMMV_F16
2111
- if (src1_convert_f16) {
2112
- ggml_cuda_pool_free(src1_dfloat, ash);
2113
- }
2451
+ if (src1_convert_f16) {
2452
+ ggml_cuda_pool_free(src1_dfloat, ash);
2453
+ }
2114
2454
  #endif // GGML_CUDA_DMMV_F16
2455
+ }
2115
2456
 
2116
2457
  (void) src1;
2117
2458
  (void) dst;
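The hunk above turns the renamed ggml_cuda_op_mul_mat_vec into a dispatcher: unless GGML_CUDA_FORCE_DMMV is defined, it reads the current device's cached compute capability and, for q4_0/q4_1/q5_0/q5_1/q8_0 weights, quantizes src1 to q8_1 into a pooled scratch buffer and runs the new integer kernels; everything else (including the k-quants and f16) keeps the dequantize_mul_mat_vec path. A condensed sketch of just the decision (function and parameter names are ours):

    static bool use_mul_mat_vec_q_sketch(int compute_capability, bool src0_has_mmvq_kernel) {
    #ifdef GGML_CUDA_FORCE_DMMV
        (void) compute_capability;
        (void) src0_has_mmvq_kernel;
        return false;
    #else
        // src0_has_mmvq_kernel: src0 is one of Q4_0, Q4_1, Q5_0, Q5_1, Q8_0.
        // The integer intrinsics exist from compute capability 6.0 but perform poorly
        // on Pascal, hence the 7.0 threshold chosen in the code above.
        return compute_capability >= 700 && src0_has_mmvq_kernel;
    #endif
    }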
@@ -2182,7 +2523,6 @@ inline void ggml_cuda_op_rope(
2182
2523
 
2183
2524
  // compute
2184
2525
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
2185
- CUDA_CHECK(cudaGetLastError());
2186
2526
 
2187
2527
  (void) dst;
2188
2528
  (void) src0_ddq_i;
@@ -2206,7 +2546,6 @@ inline void ggml_cuda_op_diag_mask_inf(
2206
2546
 
2207
2547
  // compute
2208
2548
  diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
2209
- CUDA_CHECK(cudaGetLastError());
2210
2549
 
2211
2550
  (void) dst;
2212
2551
  (void) src0_ddq_i;
@@ -2228,7 +2567,6 @@ inline void ggml_cuda_op_soft_max(
2228
2567
 
2229
2568
  // compute
2230
2569
  soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2231
- CUDA_CHECK(cudaGetLastError());
2232
2570
 
2233
2571
  (void) src1;
2234
2572
  (void) dst;
@@ -2324,10 +2662,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
  size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
- // if multiple GPUs are used they need to wait for the main GPU to finish
+ // if multiple devices are used they need to wait for the main device
+ // here an event is recorded that signifies that the main device has finished calculating the input data
  if (split && g_device_count > 1) {
  CUDA_CHECK(cudaSetDevice(g_main_device));
- CUDA_CHECK(cudaDeviceSynchronize());
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
  }
 
  for (int id = 0; id < g_device_count; ++id) {
@@ -2353,6 +2692,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  int64_t row_diff = row_high - row_low;
 
  cudaSetDevice(id);
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+ // wait for main GPU data if necessary
+ if (split && id != g_main_device) {
+ CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+ }
 
  if (src0_on_device && src0_is_contiguous) {
  if (src0_is_f32) {
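Instead of blocking the host with cudaDeviceSynchronize, the split-tensor path now orders work between devices with events: the main device records an event on its stream once the input is ready, and every other device enqueues a wait for that event on its own stream before launching. A generic sketch of the pattern (names are ours; the real code wraps every call in CUDA_CHECK):

    #include <cuda_runtime.h>

    static void signal_input_ready(cudaEvent_t input_ready, cudaStream_t main_stream) {
        cudaEventRecord(input_ready, main_stream);          // enqueued on the main device's stream
    }

    static void wait_for_input(cudaEvent_t input_ready, cudaStream_t worker_stream) {
        cudaStreamWaitEvent(worker_stream, input_ready, 0); // stream-ordered, does not block the host
    }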
@@ -2428,8 +2773,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2428
2773
  }
2429
2774
  const int64_t i11 = i13*ne12 + i12;
2430
2775
 
2431
- cudaStream_t cudaStream_main = g_cudaStreams_main[id];
2432
-
2433
2776
  // for split tensors the data begins at i0 == i0_offset_low
2434
2777
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
2435
2778
  float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2489,6 +2832,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2489
2832
 
2490
2833
  // do the computation
2491
2834
  op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
2835
+ CUDA_CHECK(cudaGetLastError());
2492
2836
 
2493
2837
  // copy dst to host or other device if necessary
2494
2838
  if (!dst_on_device) {
@@ -2518,6 +2862,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
  }
  }
+
+ // signify to main device that other device is done
+ if (split && g_device_count > 1 && id != g_main_device) {
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+ }
  }
  }
  }
@@ -2529,7 +2878,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  }
 
  CUDA_CHECK(cudaSetDevice(id));
- CUDA_CHECK(cudaDeviceSynchronize());
 
  if (src0_asq[id] > 0) {
  ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2544,11 +2892,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
  }
  }
+
+ // main device waits for all other devices to be finished
+ if (split && g_device_count > 1) {
+ CUDA_CHECK(cudaSetDevice(g_main_device));
+ for (int id = 0; id < g_device_count; ++id) {
+ if (id != g_main_device) {
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+ }
+ }
+ }
+
+ if (dst->backend == GGML_BACKEND_CPU) {
+ CUDA_CHECK(cudaSetDevice(g_main_device));
+ CUDA_CHECK(cudaDeviceSynchronize());
+ }
  }
 
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+ // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+ // Due to flatten_rows == true this does in practice not make a difference however.
+ // Better solution would be nice but right now that would require disproportionate changes.
+ GGML_ASSERT(
+ (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+ src1->type == GGML_TYPE_F32 &&
+ (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
  }
 
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
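The teardown is symmetric: every non-main device records its own event when its slice is done, the main device's stream waits on all of those events, and a full cudaDeviceSynchronize is only issued when dst lives on the CPU backend and the host is about to read the result. A condensed sketch of that tail (names are ours; the caller has already selected the main device):

    #include <cuda_runtime.h>

    static void finish_split_op_sketch(cudaStream_t main_stream, const cudaEvent_t * worker_done,
                                       int device_count, int main_device, bool dst_on_cpu) {
        for (int id = 0; id < device_count; ++id) {
            if (id != main_device) {
                cudaStreamWaitEvent(main_stream, worker_done[id], 0);
            }
        }
        if (dst_on_cpu) {
            cudaDeviceSynchronize(); // the host needs the data, so block until the main device is idle
        }
    }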
@@ -2653,8 +3022,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
2653
3022
  }else if (src0->type == GGML_TYPE_F32) {
2654
3023
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
2655
3024
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
2656
- if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
2657
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
3025
+ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
3026
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
2658
3027
  } else {
2659
3028
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
2660
3029
  }
@@ -2777,31 +3146,38 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
2777
3146
  cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
2778
3147
 
2779
3148
  extra->data_device[id] = buf;
3149
+
3150
+ if (backend == GGML_BACKEND_GPU_SPLIT) {
3151
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
3152
+ }
2780
3153
  }
2781
3154
 
2782
3155
  tensor->extra = extra;
2783
3156
  }
2784
3157
 
2785
3158
  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
2786
- if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
3159
+ if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
2787
3160
  return;
2788
3161
  }
2789
3162
 
2790
3163
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
2791
3164
 
2792
3165
  for (int id = 0; id < g_device_count; ++id) {
2793
- if (extra->data_device[id] == nullptr) {
2794
- continue;
3166
+ if (extra->data_device[id] != nullptr) {
3167
+ CUDA_CHECK(cudaSetDevice(id));
3168
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
2795
3169
  }
2796
3170
 
2797
- CUDA_CHECK(cudaSetDevice(id));
2798
- CUDA_CHECK(cudaFree(extra->data_device[id]));
3171
+ if (extra->events[id] != nullptr) {
3172
+ CUDA_CHECK(cudaSetDevice(id));
3173
+ CUDA_CHECK(cudaEventDestroy(extra->events[id]));
3174
+ }
2799
3175
  }
2800
3176
 
2801
3177
  delete extra;
2802
3178
  }
2803
3179
 
2804
- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
3180
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
2805
3181
  if (scratch && g_scratch_size == 0) {
2806
3182
  return;
2807
3183
  }
@@ -2810,11 +3186,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
2810
3186
  if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
2811
3187
  const ggml_op src0_op = tensor->src0->op;
2812
3188
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
2813
- ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
3189
+ ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
2814
3190
  }
2815
3191
  }
2816
3192
  if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
2817
- ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
3193
+ ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
2818
3194
  }
2819
3195
 
2820
3196
  tensor->backend = GGML_BACKEND_GPU;
@@ -2822,11 +3198,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
2822
3198
  memset(extra, 0, sizeof(*extra));
2823
3199
 
2824
3200
  const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
2825
- tensor->op == GGML_OP_VIEW;
3201
+ tensor->op == GGML_OP_VIEW ||
3202
+ force_inplace;
2826
3203
  const size_t size = ggml_nbytes(tensor);
2827
3204
 
2828
3205
  CUDA_CHECK(cudaSetDevice(g_main_device));
2829
- if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
3206
+ if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
2830
3207
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
2831
3208
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2832
3209
  size_t offset = 0;
@@ -2865,11 +3242,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
2865
3242
  }
2866
3243
 
2867
3244
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
2868
- ggml_cuda_assign_buffers_impl(tensor, true);
3245
+ ggml_cuda_assign_buffers_impl(tensor, true, false);
2869
3246
  }
2870
3247
 
2871
3248
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
2872
- ggml_cuda_assign_buffers_impl(tensor, false);
3249
+ ggml_cuda_assign_buffers_impl(tensor, false, false);
3250
+ }
3251
+
3252
+ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
3253
+ ggml_cuda_assign_buffers_impl(tensor, false, true);
2873
3254
  }
2874
3255
 
2875
3256
  void ggml_cuda_set_main_device(int main_device) {
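ggml_cuda_assign_buffers_impl gains a force_inplace flag, exposed through the new ggml_cuda_assign_buffers_force_inplace above: a tensor now reuses its src0 device buffer when it already aliases src0's data, when it is a view, or when the caller forces it. A one-line sketch of that decision (helper name is ours):

    static bool assign_inplace_sketch(bool aliases_src0_data, bool is_view, bool force_inplace) {
        return aliases_src0_data || is_view || force_inplace;
    }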