llama_cpp 0.3.1 → 0.3.3 — diff of the bundled ggml-cuda.cu (CUDA backend)

@@ -59,8 +59,8 @@ typedef float2 dfloat2;
59
59
  #endif //GGML_CUDA_DMMV_F16
60
60
 
61
61
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
62
- typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
63
- typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
62
+ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
63
+ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
64
64
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
65
65
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
66
66
  typedef void (*ggml_cuda_op_t)(
@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(
70
70
 
71
71
  // QK = number of values after dequantization
72
72
  // QR = QK / number of values before dequantization
73
+ // QI = number of 32 bit integers before dequantization
73
74
 
74
75
  #define QK4_0 32
75
76
  #define QR4_0 2
77
+ #define QI4_0 4
76
78
  typedef struct {
77
79
  half d; // delta
78
80
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
81
83
 
82
84
  #define QK4_1 32
83
85
  #define QR4_1 2
86
+ #define QI4_1 4
84
87
  typedef struct {
85
88
  half d; // delta
86
89
  half m; // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
90
93
 
91
94
  #define QK5_0 32
92
95
  #define QR5_0 2
96
+ #define QI5_0 4
93
97
  typedef struct {
94
98
  half d; // delta
95
99
  uint8_t qh[4]; // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
99
103
 
100
104
  #define QK5_1 32
101
105
  #define QR5_1 2
106
+ #define QI5_1 4
102
107
  typedef struct {
103
108
  half d; // delta
104
109
  half m; // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
109
114
 
110
115
  #define QK8_0 32
111
116
  #define QR8_0 1
117
+ #define QI8_0 8
112
118
  typedef struct {
113
119
  half d; // delta
114
120
  int8_t qs[QK8_0]; // quants
115
121
  } block_q8_0;
116
122
  static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
117
123
 
124
+ #define QK8_1 32
125
+ #define QR8_1 1
126
+ #define QI8_1 8
127
+ typedef struct {
128
+ half d; // delta
129
+ half s; // unquantized sum
130
+ int8_t qs[QK8_0]; // quants
131
+ } block_q8_1;
132
+ static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
133
+
134
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
135
+
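
(The new block_q8_1 format above stores, per 32-value block, an fp16 scale d and the fp16 sum s of the unquantized values; s is used further down to fold the q4_1/q5_1 offset into the integer dot product. A minimal host-side sketch of the corresponding quantization, with plain float standing in for the fp16 fields and the struct/function names invented for illustration:)

```cuda
// Illustrative reference only, not part of the diff: quantize one 32-value
// block the same way the quantize_q8_1 kernel further below does.
#include <cmath>
#include <cstdint>

#define QK8_1_REF 32

typedef struct {
    float  d;              // delta (scale); fp16 in the real block_q8_1
    float  s;              // sum of the unquantized values; fp16 in the real struct
    int8_t qs[QK8_1_REF];  // quantized values
} block_q8_1_ref;

static void quantize_block_q8_1_ref(const float * x, block_q8_1_ref * y) {
    float amax = 0.0f, sum = 0.0f;
    for (int i = 0; i < QK8_1_REF; ++i) {
        amax = fmaxf(amax, fabsf(x[i]));
        sum += x[i];
    }
    const float d = amax / 127.0f;
    for (int i = 0; i < QK8_1_REF; ++i) {
        y->qs[i] = amax == 0.0f ? 0 : (int8_t) roundf(x[i] / d);
    }
    y->d = d;
    y->s = sum;
}
```
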
118
136
  //================================= k-quants
119
137
 
120
138
  #ifdef GGML_QKK_64
@@ -190,22 +208,25 @@ typedef struct {
190
208
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
191
209
 
192
210
  #define WARP_SIZE 32
211
+ #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
193
212
 
194
213
  #define CUDA_ADD_BLOCK_SIZE 256
195
214
  #define CUDA_MUL_BLOCK_SIZE 256
215
+ #define CUDA_GELU_BLOCK_SIZE 256
196
216
  #define CUDA_SILU_BLOCK_SIZE 256
197
217
  #define CUDA_CPY_BLOCK_SIZE 32
198
218
  #define CUDA_SCALE_BLOCK_SIZE 256
199
219
  #define CUDA_ROPE_BLOCK_SIZE 256
200
220
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
221
+ #define CUDA_QUANTIZE_BLOCK_SIZE 256
201
222
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
202
223
 
203
224
  // dmmv = dequantize_mul_mat_vec
204
225
  #ifndef GGML_CUDA_DMMV_X
205
226
  #define GGML_CUDA_DMMV_X 32
206
227
  #endif
207
- #ifndef GGML_CUDA_DMMV_Y
208
- #define GGML_CUDA_DMMV_Y 1
228
+ #ifndef GGML_CUDA_MMV_Y
229
+ #define GGML_CUDA_MMV_Y 1
209
230
  #endif
210
231
 
211
232
  #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +235,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
214
235
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
215
236
  #endif
216
237
 
238
+ struct ggml_tensor_extra_gpu {
239
+ void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
240
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
241
+ };
242
+
217
243
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
218
244
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
219
245
 
@@ -241,6 +267,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
241
267
  dst[i] = x[i] * y[i%ky];
242
268
  }
243
269
 
270
+ static __global__ void gelu_f32(const float * x, float * dst, const int k) {
271
+ const float GELU_COEF_A = 0.044715f;
272
+ const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
273
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
274
+
275
+ if (i >= k) {
276
+ return;
277
+ }
278
+
279
+ float xi = x[i];
280
+ dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
281
+ }
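
(For reference, gelu_f32 above implements the usual tanh approximation of GELU, with SQRT_2_OVER_PI ≈ √(2/π):)

```latex
\mathrm{GELU}(x) \approx \tfrac{1}{2}\,x\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right)
```
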
282
+
244
283
  static __global__ void silu_f32(const float * x, float * dst, const int k) {
245
284
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
246
285
 
@@ -250,32 +289,60 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
250
289
  dst[i] = x[i] / (1.0f + expf(-x[i]));
251
290
  }
252
291
 
292
+ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
293
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
294
+ const int tid = threadIdx.x;
295
+
296
+ const float eps = 1e-5f;
297
+
298
+ float mean = 0.0f;
299
+ float var = 0.0f;
300
+
301
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
302
+ const float xi = x[row*ncols + col];
303
+ mean += xi;
304
+ var += xi * xi;
305
+ }
306
+
307
+ // sum up partial sums
308
+ #pragma unroll
309
+ for (int mask = 16; mask > 0; mask >>= 1) {
310
+ mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
311
+ var += __shfl_xor_sync(0xffffffff, var, mask, 32);
312
+ }
313
+
314
+ mean /= ncols;
315
+ var = var / ncols - mean * mean;
316
+ const float inv_var = rsqrtf(var + eps);
317
+
318
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
319
+ dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
320
+ }
321
+ }
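
(norm_f32 above computes mean and variance of each row in a single pass via Var[x] = E[x²] − E[x]², then combines the per-thread partial sums with an XOR butterfly across the warp. The same reduction idiom recurs throughout this file; isolated as a hypothetical helper it is simply:)

```cuda
// Illustrative sketch, not part of the diff: warp-wide sum via the XOR
// butterfly used above. After the loop, every lane holds the 32-lane total.
__device__ __forceinline__ float warp_reduce_sum(float v) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        v += __shfl_xor_sync(0xffffffff, v, mask, 32);
    }
    return v;
}
```
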
322
+
253
323
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
254
324
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
255
325
  const int tid = threadIdx.x;
256
326
 
257
- const float eps = 1e-6;
327
+ const float eps = 1e-6f;
258
328
 
259
329
  float tmp = 0.0f; // partial sum for thread in warp
260
330
 
261
- for (int i = 0; i < ncols; i += WARP_SIZE) {
262
- const int col = i + tid;
331
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
263
332
  const float xi = x[row*ncols + col];
264
333
  tmp += xi * xi;
265
334
  }
266
335
 
267
336
  // sum up partial sums
268
- __syncthreads();
269
337
  #pragma unroll
270
338
  for (int mask = 16; mask > 0; mask >>= 1) {
271
339
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
272
340
  }
273
341
 
274
342
  const float mean = tmp / ncols;
275
- const float scale = 1.0f / sqrtf(mean + eps);
343
+ const float scale = rsqrtf(mean + eps);
276
344
 
277
- for (int i = 0; i < ncols; i += WARP_SIZE) {
278
- const int col = i + tid;
345
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
279
346
  dst[row*ncols + col] = scale * x[row*ncols + col];
280
347
  }
281
348
  }
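
(The rms_norm_f32 changes above do not alter the computed quantity; per row it is

```latex
y_i = \frac{x_i}{\sqrt{\tfrac{1}{n}\sum_{j=1}^{n} x_j^{2} + \varepsilon}}, \qquad \varepsilon = 10^{-6},
```

with rsqrtf now used in place of 1.0f / sqrtf.)
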
@@ -384,7 +451,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
384
451
 
385
452
  //================================== k-quants
386
453
 
387
- static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
454
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
388
455
 
389
456
  const int i = blockIdx.x;
390
457
  const block_q2_K * x = (const block_q2_K *) vx;
@@ -417,7 +484,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
417
484
 
418
485
  }
419
486
 
420
- static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
487
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
421
488
 
422
489
  const int i = blockIdx.x;
423
490
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -481,7 +548,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
481
548
  }
482
549
  #endif
483
550
 
484
- static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
551
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
485
552
  const block_q4_K * x = (const block_q4_K *) vx;
486
553
 
487
554
  const int i = blockIdx.x;
@@ -521,7 +588,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
521
588
  #endif
522
589
  }
523
590
 
524
- static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
591
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
525
592
  const block_q5_K * x = (const block_q5_K *) vx;
526
593
 
527
594
  const int i = blockIdx.x;
@@ -567,7 +634,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
567
634
  #endif
568
635
  }
569
636
 
570
- static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
637
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
571
638
  const block_q6_K * x = (const block_q6_K *) vx;
572
639
 
573
640
  const int i = blockIdx.x;
@@ -611,7 +678,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
611
678
  #endif
612
679
  }
613
680
 
614
- static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
681
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
615
682
 
616
683
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
617
684
 
@@ -709,7 +776,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
709
776
  #endif
710
777
 
711
778
  // sum up partial sums and write back result
712
- __syncthreads();
713
779
  #pragma unroll
714
780
  for (int mask = 16; mask > 0; mask >>= 1) {
715
781
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -720,7 +786,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
720
786
  }
721
787
  }
722
788
 
723
- static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
789
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
724
790
 
725
791
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
726
792
  if (row > nrows) return;
@@ -814,7 +880,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
814
880
  #endif
815
881
 
816
882
  // sum up partial sums and write back result
817
- __syncthreads();
818
883
  #pragma unroll
819
884
  for (int mask = 16; mask > 0; mask >>= 1) {
820
885
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -825,7 +890,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
825
890
  }
826
891
  }
827
892
 
828
- static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
893
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
829
894
 
830
895
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
831
896
  if (row > nrows) return;
@@ -918,7 +983,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
918
983
  #endif
919
984
 
920
985
  // sum up partial sums and write back result
921
- __syncthreads();
922
986
  #pragma unroll
923
987
  for (int mask = 16; mask > 0; mask >>= 1) {
924
988
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -929,7 +993,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
929
993
  }
930
994
  }
931
995
 
932
- static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
996
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
933
997
 
934
998
  const int row = blockIdx.x;
935
999
  const int num_blocks_per_row = ncols / QK_K;
@@ -1023,7 +1087,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
1023
1087
  #endif
1024
1088
 
1025
1089
  // sum up partial sums and write back result
1026
- __syncthreads();
1027
1090
  #pragma unroll
1028
1091
  for (int mask = 16; mask > 0; mask >>= 1) {
1029
1092
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1034,7 +1097,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
1034
1097
  }
1035
1098
  }
1036
1099
 
1037
- static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
1100
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1038
1101
 
1039
1102
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
1040
1103
 
@@ -1134,7 +1197,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
1134
1197
  #endif
1135
1198
 
1136
1199
  // sum up partial sums and write back result
1137
- __syncthreads();
1138
1200
  #pragma unroll
1139
1201
  for (int mask = 16; mask > 0; mask >>= 1) {
1140
1202
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1153,8 +1215,43 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1153
1215
  v.y = x[ib + iqs + 1];
1154
1216
  }
1155
1217
 
1218
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1219
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
1220
+
1221
+ if (i >= k) {
1222
+ return;
1223
+ }
1224
+
1225
+ block_q8_1 * y = (block_q8_1 *) vy;
1226
+
1227
+ const int ib = i / QK8_1; // block index
1228
+ const int iqs = i % QK8_1; // quant index
1229
+
1230
+ const float xi = i < ndata ? x[i] : 0.0f;
1231
+ float amax = fabsf(xi);
1232
+ float sum = xi;
1233
+
1234
+ #pragma unroll
1235
+ for (int mask = 16; mask > 0; mask >>= 1) {
1236
+ amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
1237
+ sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
1238
+ }
1239
+
1240
+ const float d = amax / 127;
1241
+ const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
1242
+
1243
+ y[ib].qs[iqs] = q;
1244
+
1245
+ if (iqs > 0) {
1246
+ return;
1247
+ }
1248
+
1249
+ y[ib].d = d;
1250
+ y[ib].s = sum;
1251
+ }
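
(In quantize_q8_1 each thread handles one value and the 32 threads of a warp cover exactly one QK8_1 = 32 block, so the shuffle reduction leaves the block-wide amax and sum in every lane. With illustrative numbers only:)

```latex
\text{amax} = 3.81 \;\Rightarrow\; d = \tfrac{3.81}{127} = 0.03, \qquad
x_i = 1.5 \;\Rightarrow\; q_i = \operatorname{round}\!\bigl(\tfrac{1.5}{0.03}\bigr) = 50
```
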
1252
+
1156
1253
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1157
- static __global__ void dequantize_block(const void * vx, float * y, const int k) {
1254
+ static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
1158
1255
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
1159
1256
 
1160
1257
  if (i >= k) {
@@ -1174,8 +1271,184 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
1174
1271
  y[iybs + iqs + y_offset] = v.y;
1175
1272
  }
1176
1273
 
1274
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1275
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1276
+ const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1277
+
1278
+ int vi;
1279
+ memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1280
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1281
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1282
+
1283
+ const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
1284
+
1285
+ // subtract 8 from each quantized value
1286
+ const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
1287
+ const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
1288
+
1289
+ // SIMD dot product of quantized values
1290
+ int sumi = __dp4a(vi0, ui0, 0);
1291
+ sumi = __dp4a(vi1, ui1, sumi);
1292
+
1293
+ return sumi*d;
1294
+ #else
1295
+ return 0.0f; // only to satisfy the compiler
1296
+ #endif // __CUDA_ARCH__ >= 610
1297
+ }
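
(vec_dot_q4_0_q8_1 relies on two integer SIMD intrinsics that require compute capability 6.1 or newer: __vsub4, a per-byte subtraction with no carries between lanes, and __dp4a, a 4-way 8-bit dot product accumulated into a 32-bit integer. A rough host-side emulation of their semantics as used here, assuming signed 8-bit lanes for __dp4a; illustrative only, not part of the diff:)

```cuda
// Illustrative lane-wise semantics only; not a replacement for the intrinsics.
#include <cstdint>
#include <cstring>

static int vsub4_ref(int a, int b) {              // __vsub4: per-byte subtraction
    uint8_t la[4], lb[4], lr[4];
    std::memcpy(la, &a, 4); std::memcpy(lb, &b, 4);
    for (int i = 0; i < 4; ++i) lr[i] = (uint8_t)(la[i] - lb[i]);
    int r; std::memcpy(&r, lr, 4);
    return r;
}

static int dp4a_ref(int a, int b, int c) {        // __dp4a: 4x int8 dot product + c
    int8_t la[4], lb[4];
    std::memcpy(la, &a, 4); std::memcpy(lb, &b, 4);
    for (int i = 0; i < 4; ++i) c += (int) la[i] * (int) lb[i];
    return c;
}
```
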
1298
+
1299
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1300
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1301
+ const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1302
+
1303
+ const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
1304
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1305
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
1306
+
1307
+ const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
1308
+ const float m = bq4_1->m;
1309
+ const float s = bq8_1->s;
1310
+
1311
+ const int vi0 = (vi >> 0) & 0x0F0F0F0F;
1312
+ const int vi1 = (vi >> 4) & 0x0F0F0F0F;
1313
+
1314
+ // SIMD dot product of quantized values
1315
+ int sumi = __dp4a(vi0, ui0, 0);
1316
+ sumi = __dp4a(vi1, ui1, sumi);
1317
+
1318
+ return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1319
+ #else
1320
+ return 0.0f; // only to satisfy the compiler
1321
+ #endif // __CUDA_ARCH__ >= 610
1322
+ }
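
(The m*s term above comes from writing each q4_1 weight as w_i = d_x·q_i + m and each q8_1 activation as y_i ≈ d_y·u_i, with s = Σ y_i stored in the block:)

```latex
\sum_{i} w_i\, y_i
  = \sum_{i} (d_x q_i + m)\, y_i
  = d_x d_y \sum_{i} q_i u_i + m \sum_{i} y_i
  = d \sum_{i} q_i u_i + m\, s, \qquad d = d_x d_y .
```

(Each of the QI4_1 threads working on the block returns its partial d·sumi plus m·s/QI4_1, so after the warp reduction the constant term is counted exactly once.)
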
1323
+
1324
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1325
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1326
+ const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1327
+
1328
+ int qs;
1329
+ memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1330
+ const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
1331
+ const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
1332
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1333
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
1334
+
1335
+ const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
1336
+
1337
+ int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1338
+ vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1339
+ vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1340
+ vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1341
+ vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1342
+ vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
1343
+ int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1344
+
1345
+ int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1346
+ vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1347
+ vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1348
+ vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1349
+ vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1350
+ vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
1351
+ sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1352
+
1353
+ return sumi*d;
1354
+ #else
1355
+ return 0.0f; // only to satisfy the compiler
1356
+ #endif // __CUDA_ARCH__ >= 610
1357
+ }
1358
+
1359
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1360
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1361
+ const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1362
+
1363
+ const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
1364
+ const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
1365
+ const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
1366
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1367
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
1368
+
1369
+ const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
1370
+ const float m = bq5_1->m;
1371
+ const float s = bq8_1->s;
1372
+
1373
+ int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1374
+ vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1375
+ vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1376
+ vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1377
+ vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1378
+ int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1379
+
1380
+ int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1381
+ vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1382
+ vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1383
+ vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1384
+ vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1385
+ sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1386
+
1387
+ return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1388
+ #else
1389
+ return 0.0f; // only to satisfy the compiler
1390
+ #endif // __CUDA_ARCH__ >= 610
1391
+ }
1392
+
1393
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1394
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1395
+ const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1396
+
1397
+ int vi;
1398
+ memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1399
+ const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1400
+
1401
+ const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
1402
+
1403
+ // SIMD dot product of quantized values
1404
+ int sumi = __dp4a(vi, ui, 0);
1405
+
1406
+ return sumi*d;
1407
+ #else
1408
+ return 0.0f; // only to satisfy the compiler
1409
+ #endif // __CUDA_ARCH__ >= 610
1410
+ }
1411
+
1412
+ template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
1413
+ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1414
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
1415
+
1416
+ if (row >= nrows) {
1417
+ return;
1418
+ }
1419
+
1420
+ const int blocks_per_row = ncols / qk;
1421
+ const int blocks_per_warp = WARP_SIZE / qi;
1422
+
1423
+ // partial sum for each thread
1424
+ float tmp = 0.0f;
1425
+
1426
+ const block_q_t * x = (const block_q_t *) vx;
1427
+ const block_q8_1 * y = (const block_q8_1 *) vy;
1428
+
1429
+ for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1430
+ const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
1431
+
1432
+ const int iby = i + threadIdx.x / qi; // y block index
1433
+
1434
+ const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
1435
+
1436
+ tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
1437
+ }
1438
+
1439
+ // sum up partial sums and write back result
1440
+ #pragma unroll
1441
+ for (int mask = 16; mask > 0; mask >>= 1) {
1442
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
1443
+ }
1444
+
1445
+ if (threadIdx.x == 0) {
1446
+ dst[row] = tmp;
1447
+ }
1448
+ }
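
(In mul_mat_vec_q each warp handles one row: qi consecutive threads share one x block, iqs = threadIdx.x % qi picks each thread's 32-bit slice of that block's quants, and the warp advances blocks_per_warp = WARP_SIZE / qi blocks per loop iteration. For the q4_0 instantiation:)

```latex
\text{blocks\_per\_warp} = \frac{\mathrm{WARP\_SIZE}}{QI4\_0} = \frac{32}{4} = 8,
\qquad 8 \cdot QK4\_0 = 8 \cdot 32 = 256 \ \text{weights per warp per iteration.}
```
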
1449
+
1177
1450
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1178
- static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
1451
+ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
1179
1452
  // qk = quantized weights per x block
1180
1453
  // qr = number of quantized weights per data value in x block
1181
1454
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1228,7 +1501,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
1228
1501
  }
1229
1502
 
1230
1503
  // sum up partial sums and write back result
1231
- __syncthreads();
1232
1504
  #pragma unroll
1233
1505
  for (int mask = 16; mask > 0; mask >>= 1) {
1234
1506
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1243,7 +1515,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
1243
1515
  }
1244
1516
  }
1245
1517
 
1246
- static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1518
+ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1247
1519
  const half * x = (const half *) vx;
1248
1520
 
1249
1521
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1279,7 +1551,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
1279
1551
  const int idst = channel*nrows_dst + row_dst;
1280
1552
 
1281
1553
  // sum up partial sums and write back result
1282
- __syncthreads();
1283
1554
  #pragma unroll
1284
1555
  for (int mask = 16; mask > 0; mask >>= 1) {
1285
1556
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1291,7 +1562,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
1291
1562
  }
1292
1563
 
1293
1564
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1294
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1565
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1295
1566
  const int row_stride_x, const int channel_stride_x) {
1296
1567
 
1297
1568
  const half * x = (const half *) vx;
@@ -1325,7 +1596,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1325
1596
  }
1326
1597
 
1327
1598
  // sum up partial sums and write back result
1328
- __syncthreads();
1329
1599
  #pragma unroll
1330
1600
  for (int mask = 16; mask > 0; mask >>= 1) {
1331
1601
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1435,7 +1705,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
1435
1705
  }
1436
1706
 
1437
1707
  // sum up partial sums
1438
- __syncthreads();
1439
1708
  #pragma unroll
1440
1709
  for (int mask = 16; mask > 0; mask >>= 1) {
1441
1710
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1478,17 +1747,33 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
1478
1747
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
1479
1748
  }
1480
1749
 
1750
+ static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
1751
+ const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
1752
+ gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
1753
+ }
1754
+
1481
1755
  static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
1482
1756
  const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
1483
1757
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
1484
1758
  }
1485
1759
 
1760
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1761
+ GGML_ASSERT(ncols % WARP_SIZE == 0);
1762
+ const dim3 block_dims(WARP_SIZE, 1, 1);
1763
+ norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
1764
+ }
1765
+
1486
1766
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1487
1767
  GGML_ASSERT(ncols % WARP_SIZE == 0);
1488
1768
  const dim3 block_dims(WARP_SIZE, 1, 1);
1489
1769
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
1490
1770
  }
1491
1771
 
1772
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
1773
+ const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
1774
+ quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
1775
+ }
1776
+
1492
1777
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1493
1778
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1494
1779
  dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1557,45 +1842,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
1557
1842
 
1558
1843
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1559
1844
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1560
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1845
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1561
1846
  const dim3 block_nums(1, block_num_y, 1);
1562
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1847
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1563
1848
  dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
1564
1849
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1565
1850
  }
1566
1851
 
1567
1852
  static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1568
1853
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1569
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1854
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1570
1855
  const dim3 block_nums(1, block_num_y, 1);
1571
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1856
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1572
1857
  dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
1573
1858
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1574
1859
  }
1575
1860
 
1576
1861
  static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1577
1862
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1578
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1863
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1579
1864
  const dim3 block_nums(1, block_num_y, 1);
1580
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1865
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1581
1866
  dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
1582
1867
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1583
1868
  }
1584
1869
 
1585
1870
  static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1586
1871
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1587
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1872
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1588
1873
  const dim3 block_nums(1, block_num_y, 1);
1589
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1874
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1590
1875
  dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
1591
1876
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1592
1877
  }
1593
1878
 
1594
1879
  static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1595
1880
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1596
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1881
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1597
1882
  const dim3 block_nums(1, block_num_y, 1);
1598
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1883
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1599
1884
  dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
1600
1885
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1601
1886
  }
@@ -1642,6 +1927,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
1642
1927
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1643
1928
  }
1644
1929
 
1930
+ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1931
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1932
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1933
+ const dim3 block_nums(1, block_num_y, 1);
1934
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1935
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
1936
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1937
+ }
1938
+
1939
+ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1940
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1941
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1942
+ const dim3 block_nums(1, block_num_y, 1);
1943
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1944
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
1945
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1946
+ }
1947
+
1948
+ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1949
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1950
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1951
+ const dim3 block_nums(1, block_num_y, 1);
1952
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1953
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
1954
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1955
+ }
1956
+
1957
+ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1958
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1959
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1960
+ const dim3 block_nums(1, block_num_y, 1);
1961
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1962
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
1963
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1964
+ }
1965
+
1966
+ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1967
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1968
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1969
+ const dim3 block_nums(1, block_num_y, 1);
1970
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1971
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
1972
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1973
+ }
1974
+
1645
1975
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1646
1976
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1647
1977
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1649,9 +1979,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
1649
1979
 
1650
1980
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1651
1981
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1652
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1982
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1653
1983
  const dim3 block_nums(1, block_num_y, 1);
1654
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1984
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1655
1985
  dequantize_mul_mat_vec<1, 1, convert_f16>
1656
1986
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1657
1987
  }
@@ -1817,6 +2147,7 @@ static size_t g_scratch_offset = 0;
1817
2147
 
1818
2148
  static int g_device_count = -1;
1819
2149
  static int g_main_device = 0;
2150
+ static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
1820
2151
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
1821
2152
 
1822
2153
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1834,9 +2165,12 @@ void ggml_init_cublas() {
1834
2165
  for (int id = 0; id < g_device_count; ++id) {
1835
2166
  cudaDeviceProp prop;
1836
2167
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
1837
- fprintf(stderr, " Device %d: %s\n", id, prop.name);
2168
+ fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
2169
+
1838
2170
  g_tensor_split[id] = total_vram;
1839
2171
  total_vram += prop.totalGlobalMem;
2172
+
2173
+ g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
1840
2174
  }
1841
2175
  for (int id = 0; id < g_device_count; ++id) {
1842
2176
  g_tensor_split[id] /= total_vram;
@@ -1957,20 +2291,24 @@ inline void ggml_cuda_op_add(
1957
2291
 
1958
2292
  GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
1959
2293
  GGML_ASSERT(src1_ddf_i != nullptr);
1960
- GGML_ASSERT(dst_ddf_i != nullptr);
2294
+ GGML_ASSERT(dst_ddf_i != nullptr);
2295
+
2296
+ // TODO: support broadcasting
2297
+ GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
1961
2298
 
1962
- const int64_t ne0 = src0->ne[0];
2299
+ const int64_t ne00 = src0->ne[0];
1963
2300
  const int64_t i01_diff = i01_high - i01_low;
1964
2301
 
2302
+ // const int64_t ne10 = src1->ne[0];
2303
+
1965
2304
  // compute
1966
2305
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
1967
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
2306
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
1968
2307
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
1969
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
2308
+ add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
1970
2309
  } else {
1971
2310
  GGML_ASSERT(false);
1972
2311
  }
1973
- CUDA_CHECK(cudaGetLastError());
1974
2312
 
1975
2313
  (void) src1;
1976
2314
  (void) dst;
@@ -1986,10 +2324,9 @@ inline void ggml_cuda_op_mul(
1986
2324
 
1987
2325
  GGML_ASSERT(src0_ddf_i != nullptr);
1988
2326
  GGML_ASSERT(src1_ddf_i != nullptr);
1989
- GGML_ASSERT(dst_ddf_i != nullptr);
2327
+ GGML_ASSERT(dst_ddf_i != nullptr);
1990
2328
 
1991
2329
  const int64_t ne00 = src0->ne[0];
1992
-
1993
2330
  const int64_t ne10 = src1->ne[0];
1994
2331
  const int64_t ne11 = src1->ne[1];
1995
2332
 
@@ -1998,11 +2335,10 @@ inline void ggml_cuda_op_mul(
1998
2335
 
1999
2336
  float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
2000
2337
  float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
2001
- float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
2338
+ float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
2002
2339
 
2003
2340
  // compute
2004
2341
  mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
2005
- CUDA_CHECK(cudaGetLastError());
2006
2342
  }
2007
2343
 
2008
2344
  (void) dst;
@@ -2010,6 +2346,28 @@ inline void ggml_cuda_op_mul(
2010
2346
  (void) i02;
2011
2347
  }
2012
2348
 
2349
+ inline void ggml_cuda_op_gelu(
2350
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2351
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2352
+ cudaStream_t & cudaStream_main){
2353
+
2354
+ GGML_ASSERT(src0_ddf_i != nullptr);
2355
+ GGML_ASSERT(dst_ddf_i != nullptr);
2356
+
2357
+ const int64_t ne00 = src0->ne[0];
2358
+ const int64_t i01_diff = i01_high - i01_low;
2359
+
2360
+ // compute
2361
+ gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
2362
+
2363
+ (void) src1;
2364
+ (void) dst;
2365
+ (void) src0_ddq_i;
2366
+ (void) src1_ddf_i;
2367
+ (void) i02;
2368
+ (void) i1;
2369
+ }
2370
+
2013
2371
  inline void ggml_cuda_op_silu(
2014
2372
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2015
2373
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2023,7 +2381,28 @@ inline void ggml_cuda_op_silu(
2023
2381
 
2024
2382
  // compute
2025
2383
  silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
2026
- CUDA_CHECK(cudaGetLastError());
2384
+
2385
+ (void) src1;
2386
+ (void) dst;
2387
+ (void) src0_ddq_i;
2388
+ (void) src1_ddf_i;
2389
+ (void) i02;
2390
+ (void) i1;
2391
+ }
2392
+
2393
+ inline void ggml_cuda_op_norm(
2394
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2395
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2396
+ cudaStream_t & cudaStream_main){
2397
+
2398
+ GGML_ASSERT(src0_ddf_i != nullptr);
2399
+ GGML_ASSERT(dst_ddf_i != nullptr);
2400
+
2401
+ const int64_t ne00 = src0->ne[0];
2402
+ const int64_t i01_diff = i01_high - i01_low;
2403
+
2404
+ // compute
2405
+ norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2027
2406
 
2028
2407
  (void) src1;
2029
2408
  (void) dst;
@@ -2046,7 +2425,6 @@ inline void ggml_cuda_op_rms_norm(
2046
2425
 
2047
2426
  // compute
2048
2427
  rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2049
- CUDA_CHECK(cudaGetLastError());
2050
2428
 
2051
2429
  (void) src1;
2052
2430
  (void) dst;
@@ -2056,7 +2434,7 @@ inline void ggml_cuda_op_rms_norm(
2056
2434
  (void) i1;
2057
2435
  }
2058
2436
 
2059
- inline void ggml_cuda_op_dequantize_mul_mat_vec(
2437
+ inline void ggml_cuda_op_mul_mat_vec(
2060
2438
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2061
2439
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2062
2440
  cudaStream_t & cudaStream_main){
@@ -2068,70 +2446,115 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
2068
2446
  const int64_t ne00 = src0->ne[0];
2069
2447
  const int64_t nrows = i01_high - i01_low;
2070
2448
 
2071
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
2072
- #ifdef GGML_CUDA_DMMV_F16
2073
- size_t ash;
2074
- dfloat * src1_dfloat = nullptr; // dfloat == half
2449
+ #ifdef GGML_CUDA_FORCE_DMMV
2450
+ const bool use_mul_mat_vec_q = false;
2451
+ #else
2452
+ int id;
2453
+ CUDA_CHECK(cudaGetDevice(&id));
2075
2454
 
2076
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
2077
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
2078
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
2455
+ const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
2456
+ src0->type == GGML_TYPE_Q4_1 ||
2457
+ src0->type == GGML_TYPE_Q5_0 ||
2458
+ src0->type == GGML_TYPE_Q5_1 ||
2459
+ src0->type == GGML_TYPE_Q8_0;
2079
2460
 
2080
- if (src1_convert_f16) {
2081
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
2082
- ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
2083
- ne00, 1, sizeof(float), 0, 0,
2084
- ne00, 1, sizeof(half), 0, 0, cudaStream_main);
2085
- }
2461
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
2462
+ #endif
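
(use_mul_mat_vec_q gates the new integer-intrinsic path on compute capability 6.1, using the 100*major + 10*minor encoding stored in g_compute_capabilities during ggml_init_cublas. A standalone check of the same form, as a hypothetical single-device example and not part of the diff:)

```cuda
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);               // error checking omitted
    const int cc = 100*prop.major + 10*prop.minor;   // e.g. 6.1 -> 610
    std::printf("compute capability %d.%d -> %s\n", prop.major, prop.minor,
                cc >= 610 ? "mul_mat_vec_q (__dp4a)" : "dequantize_mul_mat_vec");
    return 0;
}
```
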
2463
+
2464
+ if (use_mul_mat_vec_q) {
2465
+ int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
2466
+ padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
2467
+ size_t as;
2468
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
2469
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
2470
+
2471
+ switch (src0->type) {
2472
+ case GGML_TYPE_Q4_0:
2473
+ mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2474
+ break;
2475
+ case GGML_TYPE_Q4_1:
2476
+ mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2477
+ break;
2478
+ case GGML_TYPE_Q5_0:
2479
+ mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2480
+ break;
2481
+ case GGML_TYPE_Q5_1:
2482
+ mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2483
+ break;
2484
+ case GGML_TYPE_Q8_0:
2485
+ mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2486
+ break;
2487
+ default:
2488
+ GGML_ASSERT(false);
2489
+ break;
2490
+ }
2491
+
2492
+ ggml_cuda_pool_free(src1_q8_1, as);
2493
+ } else {
2494
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
2495
+ #ifdef GGML_CUDA_DMMV_F16
2496
+ size_t ash;
2497
+ dfloat * src1_dfloat = nullptr; // dfloat == half
2498
+
2499
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
2500
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
2501
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
2502
+
2503
+ if (src1_convert_f16) {
2504
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
2505
+ ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
2506
+ ne00, 1, sizeof(float), 0, 0,
2507
+ ne00, 1, sizeof(half), 0, 0, cudaStream_main);
2508
+ }
2086
2509
  #else
2087
- dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
2510
+ dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
2088
2511
  #endif // GGML_CUDA_DMMV_F16
2089
2512
 
2090
- switch (src0->type) {
2091
- case GGML_TYPE_Q4_0:
2092
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2093
- break;
2094
- case GGML_TYPE_Q4_1:
2095
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2096
- break;
2097
- case GGML_TYPE_Q5_0:
2098
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2099
- break;
2100
- case GGML_TYPE_Q5_1:
2101
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2102
- break;
2103
- case GGML_TYPE_Q8_0:
2104
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2105
- break;
2106
- case GGML_TYPE_Q2_K:
2107
- dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2108
- break;
2109
- case GGML_TYPE_Q3_K:
2110
- dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2111
- break;
2112
- case GGML_TYPE_Q4_K:
2113
- dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2114
- break;
2115
- case GGML_TYPE_Q5_K:
2116
- dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2117
- break;
2118
- case GGML_TYPE_Q6_K:
2119
- dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2120
- break;
2121
- case GGML_TYPE_F16:
2122
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2123
- break;
2124
- default:
2125
- GGML_ASSERT(false);
2126
- break;
2127
- }
2128
- CUDA_CHECK(cudaGetLastError());
2513
+ switch (src0->type) {
2514
+ case GGML_TYPE_Q4_0:
2515
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2516
+ break;
2517
+ case GGML_TYPE_Q4_1:
2518
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2519
+ break;
2520
+ case GGML_TYPE_Q5_0:
2521
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2522
+ break;
2523
+ case GGML_TYPE_Q5_1:
2524
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2525
+ break;
2526
+ case GGML_TYPE_Q8_0:
2527
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2528
+ break;
2529
+ case GGML_TYPE_Q2_K:
2530
+ dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2531
+ break;
2532
+ case GGML_TYPE_Q3_K:
2533
+ dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2534
+ break;
2535
+ case GGML_TYPE_Q4_K:
2536
+ dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2537
+ break;
2538
+ case GGML_TYPE_Q5_K:
2539
+ dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2540
+ break;
2541
+ case GGML_TYPE_Q6_K:
2542
+ dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2543
+ break;
2544
+ case GGML_TYPE_F16:
2545
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2546
+ break;
2547
+ default:
2548
+ GGML_ASSERT(false);
2549
+ break;
2550
+ }
2129
2551
 
2130
2552
  #ifdef GGML_CUDA_DMMV_F16
2131
- if (src1_convert_f16) {
2132
- ggml_cuda_pool_free(src1_dfloat, ash);
2133
- }
2553
+ if (src1_convert_f16) {
2554
+ ggml_cuda_pool_free(src1_dfloat, ash);
2555
+ }
2134
2556
  #endif // GGML_CUDA_DMMV_F16
2557
+ }
2135
2558
 
2136
2559
  (void) src1;
2137
2560
  (void) dst;
@@ -2202,7 +2625,6 @@ inline void ggml_cuda_op_rope(
2202
2625
 
2203
2626
  // compute
2204
2627
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
2205
- CUDA_CHECK(cudaGetLastError());
2206
2628
 
2207
2629
  (void) dst;
2208
2630
  (void) src0_ddq_i;
@@ -2226,7 +2648,6 @@ inline void ggml_cuda_op_diag_mask_inf(
2226
2648
 
2227
2649
  // compute
2228
2650
  diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
2229
- CUDA_CHECK(cudaGetLastError());
2230
2651
 
2231
2652
  (void) dst;
2232
2653
  (void) src0_ddq_i;
@@ -2248,7 +2669,6 @@ inline void ggml_cuda_op_soft_max(
2248
2669
 
2249
2670
  // compute
2250
2671
  soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2251
- CUDA_CHECK(cudaGetLastError());
2252
2672
 
2253
2673
  (void) src1;
2254
2674
  (void) dst;
@@ -2344,10 +2764,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2344
2764
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
2345
2765
  size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
2346
2766
 
2347
- // if multiple GPUs are used they need to wait for the main GPU to finish
2767
+ // if multiple devices are used they need to wait for the main device
2768
+ // here an event is recorded that signifies that the main device has finished calculating the input data
2348
2769
  if (split && g_device_count > 1) {
2349
2770
  CUDA_CHECK(cudaSetDevice(g_main_device));
2350
- CUDA_CHECK(cudaDeviceSynchronize());
2771
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
2351
2772
  }
2352
2773
 
2353
2774
  for (int id = 0; id < g_device_count; ++id) {
@@ -2373,6 +2794,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2373
2794
  int64_t row_diff = row_high - row_low;
2374
2795
 
2375
2796
  cudaSetDevice(id);
2797
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id];
2798
+
2799
+ // wait for main GPU data if necessary
2800
+ if (split && id != g_main_device) {
2801
+ CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
2802
+ }
2376
2803
 
2377
2804
  if (src0_on_device && src0_is_contiguous) {
2378
2805
  if (src0_is_f32) {
@@ -2448,8 +2875,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2448
2875
  }
2449
2876
  const int64_t i11 = i13*ne12 + i12;
2450
2877
 
2451
- cudaStream_t cudaStream_main = g_cudaStreams_main[id];
2452
-
2453
2878
  // for split tensors the data begins at i0 == i0_offset_low
2454
2879
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
2455
2880
  float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2509,6 +2934,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2509
2934
 
2510
2935
  // do the computation
2511
2936
  op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
2937
+ CUDA_CHECK(cudaGetLastError());
2512
2938
 
2513
2939
  // copy dst to host or other device if necessary
2514
2940
  if (!dst_on_device) {
@@ -2538,6 +2964,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2538
2964
  CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
2539
2965
  }
2540
2966
  }
2967
+
2968
+ // signify to main device that other device is done
2969
+ if (split && g_device_count > 1 && id != g_main_device) {
2970
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
2971
+ }
2541
2972
  }
2542
2973
  }
2543
2974
  }
@@ -2549,7 +2980,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2549
2980
  }
2550
2981
 
2551
2982
  CUDA_CHECK(cudaSetDevice(id));
2552
- CUDA_CHECK(cudaDeviceSynchronize());
2553
2983
 
2554
2984
  if (src0_asq[id] > 0) {
2555
2985
  ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2564,6 +2994,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2564
2994
  ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
2565
2995
  }
2566
2996
  }
2997
+
2998
+ // main device waits for all other devices to be finished
2999
+ if (split && g_device_count > 1) {
3000
+ CUDA_CHECK(cudaSetDevice(g_main_device));
3001
+ for (int id = 0; id < g_device_count; ++id) {
3002
+ if (id != g_main_device) {
3003
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
3004
+ }
3005
+ }
3006
+ }
3007
+
3008
+ if (dst->backend == GGML_BACKEND_CPU) {
3009
+ CUDA_CHECK(cudaSetDevice(g_main_device));
3010
+ CUDA_CHECK(cudaDeviceSynchronize());
3011
+ }
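
(The synchronization added in this function replaces blanket cudaDeviceSynchronize calls with events: the main device records an event once the input is ready, secondary devices make their streams wait on it, and the main device later waits on each secondary device's completion event. A minimal sketch of that handoff for a hypothetical two-device setup, not the ggml code itself; error checks omitted:)

```cuda
#include <cuda_runtime.h>

// Stream on device 0 produces data, stream on device 1 consumes it.
void cross_device_handoff(cudaStream_t stream_main, cudaStream_t stream_other) {
    cudaEvent_t input_ready, other_done;

    cudaSetDevice(0);
    cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming);
    // ... enqueue producer kernels / copies on stream_main ...
    cudaEventRecord(input_ready, stream_main);          // "input is ready"

    cudaSetDevice(1);
    cudaEventCreateWithFlags(&other_done, cudaEventDisableTiming);
    cudaStreamWaitEvent(stream_other, input_ready, 0);  // waits on the GPU, not the host
    // ... enqueue consumer kernels on stream_other ...
    cudaEventRecord(other_done, stream_other);          // "consumer is finished"

    cudaSetDevice(0);
    cudaStreamWaitEvent(stream_main, other_done, 0);    // main stream resumes afterwards

    cudaEventDestroy(input_ready);
    cudaEventDestroy(other_done);
}
```
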
2567
3012
  }
2568
3013
 
2569
3014
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2582,11 +3027,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
  }

+ void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+ }
+
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
  }

+ void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+ }
+
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -2679,8 +3134,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
  }else if (src0->type == GGML_TYPE_F32) {
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
- if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
+ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
  } else {
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
  }
@@ -2765,7 +3220,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens

  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  int nrows = ggml_nrows(tensor);
+
+ const int64_t ne0 = tensor->ne[0];
+
  const size_t nb1 = tensor->nb[1];
+
  ggml_backend backend = tensor->backend;
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));
@@ -2794,34 +3253,54 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  int64_t nrows_split = row_high - row_low;

  const size_t offset_split = row_low*nb1;
- const size_t size = ggml_nbytes_split(tensor, nrows_split);
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+ }

- void * buf;
+ char * buf;
  CUDA_CHECK(cudaMalloc(&buf, size));
- void * buf_host = (char*)data + offset_split;
+ char * buf_host = (char*)data + offset_split;
+
+ // set padding to 0 to avoid possible NaN values
+ if (size > original_size) {
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+ }
+

- cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));

  extra->data_device[id] = buf;
+
+ if (backend == GGML_BACKEND_GPU_SPLIT) {
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+ }
  }

  tensor->extra = extra;
  }

  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
- if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+ if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
  return;
  }

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

  for (int id = 0; id < g_device_count; ++id) {
- if (extra->data_device[id] == nullptr) {
- continue;
+ if (extra->data_device[id] != nullptr) {
+ CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
  }

- CUDA_CHECK(cudaSetDevice(id));
- CUDA_CHECK(cudaFree(extra->data_device[id]));
+ if (extra->events[id] != nullptr) {
+ CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+ }
  }

  delete extra;
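The allocation above rounds the last row up to MATRIX_ROW_PADDING elements so the quantized kernels can read whole blocks past ne0 without going out of bounds, and the padding is then zeroed so the extra blocks cannot inject NaNs. A small sketch of the size arithmetic, with type_size/block_size standing in for ggml_type_size()/ggml_blck_size() and the helper name being hypothetical:

    // Sketch only: the size arithmetic behind the padded allocation.
    // type_size/block_size stand in for ggml_type_size()/ggml_blck_size();
    // ne0 is assumed to be a multiple of block_size, as for the ggml quant types.
    #include <cstddef>

    static size_t padded_split_size(size_t nrows, size_t ne0, size_t padding,
                                    size_t type_size, size_t block_size) {
        size_t size = nrows * ne0 * type_size / block_size;   // unpadded payload
        if (ne0 % padding != 0) {
            // extra bytes so kernels can read whole blocks past the end of the last row
            size += (padding - ne0 % padding) * type_size / block_size;
        }
        return size;
    }

    // e.g. q4_0 (18 bytes per 32 values), ne0 = 4128, padding = 256:
    // padded_split_size(1, 4128, 256, 18, 32) adds (256 - 32) * 18 / 32 = 126 bytes.

After cudaMalloc of the padded size, the tail region from original_size to size is cleared with cudaMemset, which is exactly what the hunk above does.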
@@ -2833,36 +3312,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  }

  // recursively assign CUDA buffers until a compute tensor is found
- if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
- const ggml_op src0_op = tensor->src0->op;
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+ const ggml_op src0_op = tensor->src[0]->op;
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
- ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
  }
  }
- if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
  }

  tensor->backend = GGML_BACKEND_GPU;
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));

- const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW ||
  force_inplace;
  const size_t size = ggml_nbytes(tensor);

  CUDA_CHECK(cudaSetDevice(g_main_device));
- if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
+ memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
  }
  extra->data_device[g_main_device] = src0_ddc + offset;
  } else if (tensor->op == GGML_OP_CPY) {
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
  void * src1_ddv = src1_extra->data_device[g_main_device];
  extra->data_device[g_main_device] = src1_ddv;
  } else if (scratch) {
@@ -2933,8 +3412,8 @@ void ggml_cuda_free_scratch() {
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
- || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
- || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

  switch (tensor->op) {
  case GGML_OP_ADD:
@@ -2949,12 +3428,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_mul;
  break;
+ case GGML_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_gelu;
+ break;
  case GGML_OP_SILU:
  if (!any_on_device) {
  return false;
  }
  func = ggml_cuda_silu;
  break;
+ case GGML_OP_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_norm;
+ break;
  case GGML_OP_RMS_NORM:
  if (!any_on_device) {
  return false;
@@ -2962,7 +3453,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  func = ggml_cuda_rms_norm;
  break;
  case GGML_OP_MUL_MAT:
- if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
  return false;
  }
  func = ggml_cuda_mul_mat;
@@ -3016,6 +3507,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return true;
  }
- func(tensor->src0, tensor->src1, tensor);
+ func(tensor->src[0], tensor->src[1], tensor);
  return true;
  }