llama_cpp 0.3.1 → 0.3.3

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -59,8 +59,8 @@ typedef float2 dfloat2;
59
59
  #endif //GGML_CUDA_DMMV_F16
60
60
 
61
61
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
62
- typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
63
- typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
62
+ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
63
+ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
64
64
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
65
65
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
66
66
  typedef void (*ggml_cuda_op_t)(
@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(
70
70
 
71
71
  // QK = number of values after dequantization
72
72
  // QR = QK / number of values before dequantization
73
+ // QI = number of 32 bit integers before dequantization
73
74
 
74
75
  #define QK4_0 32
75
76
  #define QR4_0 2
77
+ #define QI4_0 4
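A quick sanity check on how these three constants relate for q4_0 (my reading of the comments above, not part of the diff): a block holds QK4_0 = 32 weights, QR4_0 = 2 quants fit in each byte, so the quant payload is 32/2 = 16 bytes, i.e. QI4_0 = 4 32-bit integers. The same relation QI = QK/(QR*4) holds for the other QI defines added below.

    // Hedged sketch, not part of the upstream change: sanity checks of the QK/QR/QI relation for q4_0.
    static_assert(QK4_0 / QR4_0 == 16,                    "16 bytes of packed 4-bit quants per q4_0 block");
    static_assert(QK4_0 / (QR4_0 * sizeof(int)) == QI4_0, "4 x 32-bit ints of quant data per q4_0 block");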
76
78
  typedef struct {
77
79
  half d; // delta
78
80
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
81
83
 
82
84
  #define QK4_1 32
83
85
  #define QR4_1 2
86
+ #define QI4_1 4
84
87
  typedef struct {
85
88
  half d; // delta
86
89
  half m; // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
90
93
 
91
94
  #define QK5_0 32
92
95
  #define QR5_0 2
96
+ #define QI5_0 4
93
97
  typedef struct {
94
98
  half d; // delta
95
99
  uint8_t qh[4]; // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
99
103
 
100
104
  #define QK5_1 32
101
105
  #define QR5_1 2
106
+ #define QI5_1 4
102
107
  typedef struct {
103
108
  half d; // delta
104
109
  half m; // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
109
114
 
110
115
  #define QK8_0 32
111
116
  #define QR8_0 1
117
+ #define QI8_0 8
112
118
  typedef struct {
113
119
  half d; // delta
114
120
  int8_t qs[QK8_0]; // quants
115
121
  } block_q8_0;
116
122
  static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
117
123
 
124
+ #define QK8_1 32
125
+ #define QR8_1 1
126
+ #define QI8_1 8
127
+ typedef struct {
128
+ half d; // delta
129
+ half s; // unquantized sum
130
+ int8_t qs[QK8_0]; // quants
131
+ } block_q8_1;
132
+ static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
133
+
134
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
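The new block_q8_1 differs from block_q8_0 only in the extra half s, the sum of the 32 original floats; the vec_dot kernels added later use it to fold the per-block offset m of q4_1/q5_1 into the dot product. A minimal CPU sketch of the same quantization math (floats stand in for the half fields; assumes <math.h> and <stdint.h>; illustrative only, not part of the diff):

    // Quantize one block of 32 floats the way quantize_q8_1 does (reference sketch).
    void quantize_block_q8_1_ref(const float * x, int8_t * qs, float * d, float * s) {
        float amax = 0.0f, sum = 0.0f;
        for (int i = 0; i < 32; ++i) {
            amax = fmaxf(amax, fabsf(x[i]));
            sum += x[i];
        }
        *d = amax / 127.0f;   // scale: the largest magnitude maps to +/-127
        *s = sum;             // unquantized sum, consumed by the *_q4_1 / *_q5_1 dot products
        for (int i = 0; i < 32; ++i) {
            qs[i] = amax == 0.0f ? 0 : (int8_t) roundf(x[i] / *d);
        }
    }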
135
+
118
136
  //================================= k-quants
119
137
 
120
138
  #ifdef GGML_QKK_64
@@ -190,22 +208,25 @@ typedef struct {
190
208
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
191
209
 
192
210
  #define WARP_SIZE 32
211
+ #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
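The new mul_mat_vec_q path processes whole quant blocks per warp, which can read slightly past the end of the last matrix row; padding that row (and the quantized activation) up to a multiple of 256 elements keeps those reads in bounds. The round-up idiom used for padded_row_size further down, as a hedged standalone sketch:

    // Round n up to the next multiple of `multiple` (e.g. 4097 -> 4352 for multiple = 256).
    static int round_up(int n, int multiple) {
        int padded = n + multiple - 1;
        return padded - padded % multiple;
    }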
193
212
 
194
213
  #define CUDA_ADD_BLOCK_SIZE 256
195
214
  #define CUDA_MUL_BLOCK_SIZE 256
215
+ #define CUDA_GELU_BLOCK_SIZE 256
196
216
  #define CUDA_SILU_BLOCK_SIZE 256
197
217
  #define CUDA_CPY_BLOCK_SIZE 32
198
218
  #define CUDA_SCALE_BLOCK_SIZE 256
199
219
  #define CUDA_ROPE_BLOCK_SIZE 256
200
220
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
221
+ #define CUDA_QUANTIZE_BLOCK_SIZE 256
201
222
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
202
223
 
203
224
  // dmmv = dequantize_mul_mat_vec
204
225
  #ifndef GGML_CUDA_DMMV_X
205
226
  #define GGML_CUDA_DMMV_X 32
206
227
  #endif
207
- #ifndef GGML_CUDA_DMMV_Y
208
- #define GGML_CUDA_DMMV_Y 1
228
+ #ifndef GGML_CUDA_MMV_Y
229
+ #define GGML_CUDA_MMV_Y 1
209
230
  #endif
210
231
 
211
232
  #ifndef K_QUANTS_PER_ITERATION
@@ -214,6 +235,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
214
235
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
215
236
  #endif
216
237
 
238
+ struct ggml_tensor_extra_gpu {
239
+ void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
240
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
241
+ };
242
+
217
243
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
218
244
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
219
245
 
@@ -241,6 +267,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
241
267
  dst[i] = x[i] * y[i%ky];
242
268
  }
243
269
 
270
+ static __global__ void gelu_f32(const float * x, float * dst, const int k) {
271
+ const float GELU_COEF_A = 0.044715f;
272
+ const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
273
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
274
+
275
+ if (i >= k) {
276
+ return;
277
+ }
278
+
279
+ float xi = x[i];
280
+ dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
281
+ }
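The added kernel is the usual tanh-based GELU approximation, gelu(x) ≈ 0.5·x·(1 + tanh(√(2/π)·(x + 0.044715·x³))); the expression in the kernel just factors x out of the inner polynomial. A host-side reference of the same formula (sketch, assuming <math.h>):

    float gelu_ref(float x) {
        const float a = 0.044715f;
        const float sqrt_2_over_pi = 0.79788456080286535f;
        return 0.5f * x * (1.0f + tanhf(sqrt_2_over_pi * (x + a * x * x * x)));
    }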
282
+
244
283
  static __global__ void silu_f32(const float * x, float * dst, const int k) {
245
284
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
246
285
 
@@ -250,32 +289,60 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
250
289
  dst[i] = x[i] / (1.0f + expf(-x[i]));
251
290
  }
252
291
 
292
+ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
293
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
294
+ const int tid = threadIdx.x;
295
+
296
+ const float eps = 1e-5f;
297
+
298
+ float mean = 0.0f;
299
+ float var = 0.0f;
300
+
301
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
302
+ const float xi = x[row*ncols + col];
303
+ mean += xi;
304
+ var += xi * xi;
305
+ }
306
+
307
+ // sum up partial sums
308
+ #pragma unroll
309
+ for (int mask = 16; mask > 0; mask >>= 1) {
310
+ mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
311
+ var += __shfl_xor_sync(0xffffffff, var, mask, 32);
312
+ }
313
+
314
+ mean /= ncols;
315
+ var = var / ncols - mean * mean;
316
+ const float inv_var = rsqrtf(var + eps);
317
+
318
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
319
+ dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
320
+ }
321
+ }
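norm_f32 assigns one 32-thread warp per row and computes the variance as E[x²] − E[x]²; the #pragma unroll loop is a warp-level XOR-shuffle ("butterfly") reduction, which is also why the later hunks can drop the __syncthreads() calls that used to precede it: the reduction never leaves a single warp. A hedged standalone sketch of that reduction:

    // After the five xor-shuffle steps every lane of the warp holds the sum of all 32 lanes.
    static __device__ float warp_reduce_sum(float v) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            v += __shfl_xor_sync(0xffffffff, v, mask, 32);
        }
        return v;
    }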
322
+
253
323
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
254
324
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
255
325
  const int tid = threadIdx.x;
256
326
 
257
- const float eps = 1e-6;
327
+ const float eps = 1e-6f;
258
328
 
259
329
  float tmp = 0.0f; // partial sum for thread in warp
260
330
 
261
- for (int i = 0; i < ncols; i += WARP_SIZE) {
262
- const int col = i + tid;
331
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
263
332
  const float xi = x[row*ncols + col];
264
333
  tmp += xi * xi;
265
334
  }
266
335
 
267
336
  // sum up partial sums
268
- __syncthreads();
269
337
  #pragma unroll
270
338
  for (int mask = 16; mask > 0; mask >>= 1) {
271
339
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
272
340
  }
273
341
 
274
342
  const float mean = tmp / ncols;
275
- const float scale = 1.0f / sqrtf(mean + eps);
343
+ const float scale = rsqrtf(mean + eps);
276
344
 
277
- for (int i = 0; i < ncols; i += WARP_SIZE) {
278
- const int col = i + tid;
345
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
279
346
  dst[row*ncols + col] = scale * x[row*ncols + col];
280
347
  }
281
348
  }
@@ -384,7 +451,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
384
451
 
385
452
  //================================== k-quants
386
453
 
387
- static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
454
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
388
455
 
389
456
  const int i = blockIdx.x;
390
457
  const block_q2_K * x = (const block_q2_K *) vx;
@@ -417,7 +484,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
417
484
 
418
485
  }
419
486
 
420
- static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
487
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
421
488
 
422
489
  const int i = blockIdx.x;
423
490
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -481,7 +548,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
481
548
  }
482
549
  #endif
483
550
 
484
- static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
551
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
485
552
  const block_q4_K * x = (const block_q4_K *) vx;
486
553
 
487
554
  const int i = blockIdx.x;
@@ -521,7 +588,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
521
588
  #endif
522
589
  }
523
590
 
524
- static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
591
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
525
592
  const block_q5_K * x = (const block_q5_K *) vx;
526
593
 
527
594
  const int i = blockIdx.x;
@@ -567,7 +634,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
567
634
  #endif
568
635
  }
569
636
 
570
- static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
637
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
571
638
  const block_q6_K * x = (const block_q6_K *) vx;
572
639
 
573
640
  const int i = blockIdx.x;
@@ -611,7 +678,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
611
678
  #endif
612
679
  }
613
680
 
614
- static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
681
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
615
682
 
616
683
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
617
684
 
@@ -709,7 +776,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
709
776
  #endif
710
777
 
711
778
  // sum up partial sums and write back result
712
- __syncthreads();
713
779
  #pragma unroll
714
780
  for (int mask = 16; mask > 0; mask >>= 1) {
715
781
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -720,7 +786,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
720
786
  }
721
787
  }
722
788
 
723
- static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
789
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
724
790
 
725
791
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
726
792
  if (row > nrows) return;
@@ -814,7 +880,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
814
880
  #endif
815
881
 
816
882
  // sum up partial sums and write back result
817
- __syncthreads();
818
883
  #pragma unroll
819
884
  for (int mask = 16; mask > 0; mask >>= 1) {
820
885
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -825,7 +890,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
825
890
  }
826
891
  }
827
892
 
828
- static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
893
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
829
894
 
830
895
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
831
896
  if (row > nrows) return;
@@ -918,7 +983,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
918
983
  #endif
919
984
 
920
985
  // sum up partial sums and write back result
921
- __syncthreads();
922
986
  #pragma unroll
923
987
  for (int mask = 16; mask > 0; mask >>= 1) {
924
988
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -929,7 +993,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
929
993
  }
930
994
  }
931
995
 
932
- static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
996
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
933
997
 
934
998
  const int row = blockIdx.x;
935
999
  const int num_blocks_per_row = ncols / QK_K;
@@ -1023,7 +1087,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
1023
1087
  #endif
1024
1088
 
1025
1089
  // sum up partial sums and write back result
1026
- __syncthreads();
1027
1090
  #pragma unroll
1028
1091
  for (int mask = 16; mask > 0; mask >>= 1) {
1029
1092
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1034,7 +1097,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
1034
1097
  }
1035
1098
  }
1036
1099
 
1037
- static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
1100
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1038
1101
 
1039
1102
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
1040
1103
 
@@ -1134,7 +1197,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
1134
1197
  #endif
1135
1198
 
1136
1199
  // sum up partial sums and write back result
1137
- __syncthreads();
1138
1200
  #pragma unroll
1139
1201
  for (int mask = 16; mask > 0; mask >>= 1) {
1140
1202
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1153,8 +1215,43 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1153
1215
  v.y = x[ib + iqs + 1];
1154
1216
  }
1155
1217
 
1218
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1219
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
1220
+
1221
+ if (i >= k) {
1222
+ return;
1223
+ }
1224
+
1225
+ block_q8_1 * y = (block_q8_1 *) vy;
1226
+
1227
+ const int ib = i / QK8_1; // block index
1228
+ const int iqs = i % QK8_1; // quant index
1229
+
1230
+ const float xi = i < ndata ? x[i] : 0.0f;
1231
+ float amax = fabsf(xi);
1232
+ float sum = xi;
1233
+
1234
+ #pragma unroll
1235
+ for (int mask = 16; mask > 0; mask >>= 1) {
1236
+ amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
1237
+ sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
1238
+ }
1239
+
1240
+ const float d = amax / 127;
1241
+ const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
1242
+
1243
+ y[ib].qs[iqs] = q;
1244
+
1245
+ if (iqs > 0) {
1246
+ return;
1247
+ }
1248
+
1249
+ y[ib].d = d;
1250
+ y[ib].s = sum;
1251
+ }
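quantize_q8_1 maps one thread per element; because QK8_1 == 32 == WARP_SIZE, each warp covers exactly one block, so the amax/sum shuffles above are per-block reductions and only the lane with iqs == 0 writes d and s. A hedged host-side launch sketch mirroring quantize_row_q8_1_cuda further down (x_device and y_device are illustrative names):

    const int padded_k   = ((ne00 + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING) * MATRIX_ROW_PADDING;
    const int num_blocks = (padded_k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    // elements in [ne00, padded_k) are treated as 0.0f and quantize to 0
    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x_device, y_device, ne00, padded_k);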
1252
+
1156
1253
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1157
- static __global__ void dequantize_block(const void * vx, float * y, const int k) {
1254
+ static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
1158
1255
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
1159
1256
 
1160
1257
  if (i >= k) {
@@ -1174,8 +1271,184 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
1174
1271
  y[iybs + iqs + y_offset] = v.y;
1175
1272
  }
1176
1273
 
1274
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1275
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1276
+ const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1277
+
1278
+ int vi;
1279
+ memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1280
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1281
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1282
+
1283
+ const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
1284
+
1285
+ // subtract 8 from each quantized value
1286
+ const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
1287
+ const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
1288
+
1289
+ // SIMD dot product of quantized values
1290
+ int sumi = __dp4a(vi0, ui0, 0);
1291
+ sumi = __dp4a(vi1, ui1, sumi);
1292
+
1293
+ return sumi*d;
1294
+ #else
1295
+ return 0.0f; // only to satisfy the compiler
1296
+ #endif // __CUDA_ARCH__ >= 610
1297
+ }
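For q4_0 each weight is stored as an unsigned nibble q with an implicit −8 offset, so x ≈ d·(q − 8). The kernel widens eight nibbles into two ints holding one byte per value, uses __vsub4 to subtract 8 from every byte in parallel, and __dp4a for a 4-way int8 dot product with accumulation, giving sumi = Σ (q − 8)·q8 and hence sumi·d ≈ Σ x·y with d = d4_0·d8_1. A scalar sketch of what __dp4a computes (illustrative only):

    // c plus the sum of the four signed-byte products packed in a and b (reference for __dp4a).
    static int dp4a_ref(int a, int b, int c) {
        const signed char * pa = (const signed char *) &a;
        const signed char * pb = (const signed char *) &b;
        for (int i = 0; i < 4; ++i) {
            c += pa[i] * pb[i];
        }
        return c;
    }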
1298
+
1299
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1300
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1301
+ const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1302
+
1303
+ const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
1304
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1305
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
1306
+
1307
+ const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
1308
+ const float m = bq4_1->m;
1309
+ const float s = bq8_1->s;
1310
+
1311
+ const int vi0 = (vi >> 0) & 0x0F0F0F0F;
1312
+ const int vi1 = (vi >> 4) & 0x0F0F0F0F;
1313
+
1314
+ // SIMD dot product of quantized values
1315
+ int sumi = __dp4a(vi0, ui0, 0);
1316
+ sumi = __dp4a(vi1, ui1, sumi);
1317
+
1318
+ return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1319
+ #else
1320
+ return 0.0f; // only to satisfy the compiler
1321
+ #endif // __CUDA_ARCH__ >= 610
1322
+ }
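The extra m*s / QI4_1 term is where block_q8_1's stored sum pays off. With x_i ≈ d·q_i + m and y_i ≈ d8·q8_i:

    sum_i x_i*y_i ≈ sum_i (d*q_i + m)*y_i
                  = d * sum_i q_i*y_i  +  m * sum_i y_i
                  ≈ d*d8 * sumi        +  m * s

Each of the QI4_1 threads cooperating on a block computes only its slice of sumi, so each adds m*s/QI4_1 and the warp-level reduction in mul_mat_vec_q recovers the full m*s exactly once; the same reasoning applies to vec_dot_q5_1_q8_1 below.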
1323
+
1324
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1325
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1326
+ const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1327
+
1328
+ int qs;
1329
+ memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1330
+ const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
1331
+ const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
1332
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1333
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
1334
+
1335
+ const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
1336
+
1337
+ int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1338
+ vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1339
+ vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1340
+ vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1341
+ vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1342
+ vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
1343
+ int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1344
+
1345
+ int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1346
+ vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1347
+ vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1348
+ vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1349
+ vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1350
+ vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
1351
+ sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1352
+
1353
+ return sumi*d;
1354
+ #else
1355
+ return 0.0f; // only to satisfy the compiler
1356
+ #endif // __CUDA_ARCH__ >= 610
1357
+ }
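The shift-and-mask ladder reassembles the 5th bits: q5_0 values are 0..31 with an implicit −16 offset, the low 4 bits live in qs and the 5th bits in qh. Bit j of qh0 has to land in bit 4 of byte j of the packed int, i.e. at bit positions 4, 12, 20 and 28, which is what the shifts by 4, 11, 18, 25 and the masks 0x00000010, 0x00001000, 0x00100000, 0x10000000 achieve (the "1 -> 5" comments count bits from 1). __vsub4(vi, 0x10101010) then subtracts 16 from each reassembled byte before the __dp4a dot products.

    // qh bit 0 << 4  -> packed bit 4    (5th bit of byte 0)
    // qh bit 1 << 11 -> packed bit 12   (5th bit of byte 1)
    // qh bit 2 << 18 -> packed bit 20   (5th bit of byte 2)
    // qh bit 3 << 25 -> packed bit 28   (5th bit of byte 3)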
1358
+
1359
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1360
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1361
+ const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1362
+
1363
+ const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
1364
+ const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
1365
+ const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
1366
+ const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1367
+ const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
1368
+
1369
+ const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
1370
+ const float m = bq5_1->m;
1371
+ const float s = bq8_1->s;
1372
+
1373
+ int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1374
+ vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1375
+ vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1376
+ vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1377
+ vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1378
+ int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1379
+
1380
+ int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1381
+ vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1382
+ vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1383
+ vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1384
+ vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1385
+ sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1386
+
1387
+ return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1388
+ #else
1389
+ return 0.0f; // only to satisfy the compiler
1390
+ #endif // __CUDA_ARCH__ >= 610
1391
+ }
1392
+
1393
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1394
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1395
+ const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1396
+
1397
+ int vi;
1398
+ memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1399
+ const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1400
+
1401
+ const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
1402
+
1403
+ // SIMD dot product of quantized values
1404
+ int sumi = __dp4a(vi, ui, 0);
1405
+
1406
+ return sumi*d;
1407
+ #else
1408
+ return 0.0f; // only to satisfy the compiler
1409
+ #endif // __CUDA_ARCH__ >= 610
1410
+ }
1411
+
1412
+ template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
1413
+ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1414
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
1415
+
1416
+ if (row >= nrows) {
1417
+ return;
1418
+ }
1419
+
1420
+ const int blocks_per_row = ncols / qk;
1421
+ const int blocks_per_warp = WARP_SIZE / qi;
1422
+
1423
+ // partial sum for each thread
1424
+ float tmp = 0.0f;
1425
+
1426
+ const block_q_t * x = (const block_q_t *) vx;
1427
+ const block_q8_1 * y = (const block_q8_1 *) vy;
1428
+
1429
+ for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1430
+ const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
1431
+
1432
+ const int iby = i + threadIdx.x / qi; // y block index
1433
+
1434
+ const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
1435
+
1436
+ tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
1437
+ }
1438
+
1439
+ // sum up partial sums and write back result
1440
+ #pragma unroll
1441
+ for (int mask = 16; mask > 0; mask >>= 1) {
1442
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
1443
+ }
1444
+
1445
+ if (threadIdx.x == 0) {
1446
+ dst[row] = tmp;
1447
+ }
1448
+ }
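In mul_mat_vec_q each warp owns one output row: qi consecutive lanes cooperate on one quant block (one lane per 32-bit chunk of quants), so WARP_SIZE/qi blocks are covered per loop iteration and the final butterfly reduction combines the per-lane partial sums. A worked example of the indexing for q4_0 (QK4_0 = 32, QI4_0 = 4; my reading, not part of the diff):

    // blocks_per_warp = 32 / 4 = 8  -> 8 blocks = 256 weights per iteration of the i-loop
    // lanes 0..3 share x block ibx = row*blocks_per_row + i + 0, lanes 4..7 share block i + 1, ...
    // lane L reads 32-bit quant chunk iqs = L % 4 of its block and the matching ints of the q8_1 block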
1449
+
1177
1450
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1178
- static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
1451
+ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
1179
1452
  // qk = quantized weights per x block
1180
1453
  // qr = number of quantized weights per data value in x block
1181
1454
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1228,7 +1501,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
1228
1501
  }
1229
1502
 
1230
1503
  // sum up partial sums and write back result
1231
- __syncthreads();
1232
1504
  #pragma unroll
1233
1505
  for (int mask = 16; mask > 0; mask >>= 1) {
1234
1506
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1243,7 +1515,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
1243
1515
  }
1244
1516
  }
1245
1517
 
1246
- static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1518
+ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1247
1519
  const half * x = (const half *) vx;
1248
1520
 
1249
1521
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1279,7 +1551,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
1279
1551
  const int idst = channel*nrows_dst + row_dst;
1280
1552
 
1281
1553
  // sum up partial sums and write back result
1282
- __syncthreads();
1283
1554
  #pragma unroll
1284
1555
  for (int mask = 16; mask > 0; mask >>= 1) {
1285
1556
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1291,7 +1562,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
1291
1562
  }
1292
1563
 
1293
1564
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1294
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1565
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1295
1566
  const int row_stride_x, const int channel_stride_x) {
1296
1567
 
1297
1568
  const half * x = (const half *) vx;
@@ -1325,7 +1596,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1325
1596
  }
1326
1597
 
1327
1598
  // sum up partial sums and write back result
1328
- __syncthreads();
1329
1599
  #pragma unroll
1330
1600
  for (int mask = 16; mask > 0; mask >>= 1) {
1331
1601
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1435,7 +1705,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
1435
1705
  }
1436
1706
 
1437
1707
  // sum up partial sums
1438
- __syncthreads();
1439
1708
  #pragma unroll
1440
1709
  for (int mask = 16; mask > 0; mask >>= 1) {
1441
1710
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1478,17 +1747,33 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
1478
1747
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
1479
1748
  }
1480
1749
 
1750
+ static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
1751
+ const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
1752
+ gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
1753
+ }
1754
+
1481
1755
  static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
1482
1756
  const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
1483
1757
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
1484
1758
  }
1485
1759
 
1760
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1761
+ GGML_ASSERT(ncols % WARP_SIZE == 0);
1762
+ const dim3 block_dims(WARP_SIZE, 1, 1);
1763
+ norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
1764
+ }
1765
+
1486
1766
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1487
1767
  GGML_ASSERT(ncols % WARP_SIZE == 0);
1488
1768
  const dim3 block_dims(WARP_SIZE, 1, 1);
1489
1769
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
1490
1770
  }
1491
1771
 
1772
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
1773
+ const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
1774
+ quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
1775
+ }
1776
+
1492
1777
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1493
1778
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1494
1779
  dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1557,45 +1842,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
1557
1842
 
1558
1843
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1559
1844
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1560
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1845
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1561
1846
  const dim3 block_nums(1, block_num_y, 1);
1562
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1847
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1563
1848
  dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
1564
1849
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1565
1850
  }
1566
1851
 
1567
1852
  static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1568
1853
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1569
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1854
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1570
1855
  const dim3 block_nums(1, block_num_y, 1);
1571
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1856
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1572
1857
  dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
1573
1858
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1574
1859
  }
1575
1860
 
1576
1861
  static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1577
1862
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1578
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1863
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1579
1864
  const dim3 block_nums(1, block_num_y, 1);
1580
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1865
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1581
1866
  dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
1582
1867
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1583
1868
  }
1584
1869
 
1585
1870
  static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1586
1871
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1587
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1872
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1588
1873
  const dim3 block_nums(1, block_num_y, 1);
1589
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1874
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1590
1875
  dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
1591
1876
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1592
1877
  }
1593
1878
 
1594
1879
  static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1595
1880
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1596
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1881
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1597
1882
  const dim3 block_nums(1, block_num_y, 1);
1598
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1883
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1599
1884
  dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
1600
1885
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1601
1886
  }
@@ -1642,6 +1927,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
1642
1927
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1643
1928
  }
1644
1929
 
1930
+ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1931
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1932
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1933
+ const dim3 block_nums(1, block_num_y, 1);
1934
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1935
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
1936
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1937
+ }
1938
+
1939
+ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1940
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1941
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1942
+ const dim3 block_nums(1, block_num_y, 1);
1943
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1944
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
1945
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1946
+ }
1947
+
1948
+ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1949
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1950
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1951
+ const dim3 block_nums(1, block_num_y, 1);
1952
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1953
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
1954
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1955
+ }
1956
+
1957
+ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1958
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1959
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1960
+ const dim3 block_nums(1, block_num_y, 1);
1961
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1962
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
1963
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1964
+ }
1965
+
1966
+ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1967
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1968
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1969
+ const dim3 block_nums(1, block_num_y, 1);
1970
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1971
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
1972
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1973
+ }
1974
+
1645
1975
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1646
1976
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1647
1977
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1649,9 +1979,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
1649
1979
 
1650
1980
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1651
1981
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1652
- const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1982
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1653
1983
  const dim3 block_nums(1, block_num_y, 1);
1654
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1984
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
1655
1985
  dequantize_mul_mat_vec<1, 1, convert_f16>
1656
1986
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1657
1987
  }
@@ -1817,6 +2147,7 @@ static size_t g_scratch_offset = 0;
1817
2147
 
1818
2148
  static int g_device_count = -1;
1819
2149
  static int g_main_device = 0;
2150
+ static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
1820
2151
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
1821
2152
 
1822
2153
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1834,9 +2165,12 @@ void ggml_init_cublas() {
1834
2165
  for (int id = 0; id < g_device_count; ++id) {
1835
2166
  cudaDeviceProp prop;
1836
2167
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
1837
- fprintf(stderr, " Device %d: %s\n", id, prop.name);
2168
+ fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
2169
+
1838
2170
  g_tensor_split[id] = total_vram;
1839
2171
  total_vram += prop.totalGlobalMem;
2172
+
2173
+ g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
1840
2174
  }
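The capability is stored as 100*major + 10*minor, so it can be compared directly against the architecture gates used by the new kernels:

    // e.g. prop.major = 6, prop.minor = 1  ->  g_compute_capabilities[id] = 610  (>= 610: __dp4a-based mul_mat_vec_q path)
    //      prop.major = 5, prop.minor = 2  ->  520                               (falls back to the dequantize_mul_mat_vec kernels)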
1841
2175
  for (int id = 0; id < g_device_count; ++id) {
1842
2176
  g_tensor_split[id] /= total_vram;
@@ -1957,20 +2291,24 @@ inline void ggml_cuda_op_add(
1957
2291
 
1958
2292
  GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
1959
2293
  GGML_ASSERT(src1_ddf_i != nullptr);
1960
- GGML_ASSERT(dst_ddf_i != nullptr);
2294
+ GGML_ASSERT(dst_ddf_i != nullptr);
2295
+
2296
+ // TODO: support broadcasting
2297
+ GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
1961
2298
 
1962
- const int64_t ne0 = src0->ne[0];
2299
+ const int64_t ne00 = src0->ne[0];
1963
2300
  const int64_t i01_diff = i01_high - i01_low;
1964
2301
 
2302
+ // const int64_t ne10 = src1->ne[0];
2303
+
1965
2304
  // compute
1966
2305
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
1967
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
2306
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
1968
2307
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
1969
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
2308
+ add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
1970
2309
  } else {
1971
2310
  GGML_ASSERT(false);
1972
2311
  }
1973
- CUDA_CHECK(cudaGetLastError());
1974
2312
 
1975
2313
  (void) src1;
1976
2314
  (void) dst;
@@ -1986,10 +2324,9 @@ inline void ggml_cuda_op_mul(
1986
2324
 
1987
2325
  GGML_ASSERT(src0_ddf_i != nullptr);
1988
2326
  GGML_ASSERT(src1_ddf_i != nullptr);
1989
- GGML_ASSERT(dst_ddf_i != nullptr);
2327
+ GGML_ASSERT(dst_ddf_i != nullptr);
1990
2328
 
1991
2329
  const int64_t ne00 = src0->ne[0];
1992
-
1993
2330
  const int64_t ne10 = src1->ne[0];
1994
2331
  const int64_t ne11 = src1->ne[1];
1995
2332
 
@@ -1998,11 +2335,10 @@ inline void ggml_cuda_op_mul(
1998
2335
 
1999
2336
  float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
2000
2337
  float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
2001
- float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
2338
+ float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
2002
2339
 
2003
2340
  // compute
2004
2341
  mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
2005
- CUDA_CHECK(cudaGetLastError());
2006
2342
  }
2007
2343
 
2008
2344
  (void) dst;
@@ -2010,6 +2346,28 @@ inline void ggml_cuda_op_mul(
2010
2346
  (void) i02;
2011
2347
  }
2012
2348
 
2349
+ inline void ggml_cuda_op_gelu(
2350
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2351
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2352
+ cudaStream_t & cudaStream_main){
2353
+
2354
+ GGML_ASSERT(src0_ddf_i != nullptr);
2355
+ GGML_ASSERT(dst_ddf_i != nullptr);
2356
+
2357
+ const int64_t ne00 = src0->ne[0];
2358
+ const int64_t i01_diff = i01_high - i01_low;
2359
+
2360
+ // compute
2361
+ gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
2362
+
2363
+ (void) src1;
2364
+ (void) dst;
2365
+ (void) src0_ddq_i;
2366
+ (void) src1_ddf_i;
2367
+ (void) i02;
2368
+ (void) i1;
2369
+ }
2370
+
2013
2371
  inline void ggml_cuda_op_silu(
2014
2372
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2015
2373
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2023,7 +2381,28 @@ inline void ggml_cuda_op_silu(
2023
2381
 
2024
2382
  // compute
2025
2383
  silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
2026
- CUDA_CHECK(cudaGetLastError());
2384
+
2385
+ (void) src1;
2386
+ (void) dst;
2387
+ (void) src0_ddq_i;
2388
+ (void) src1_ddf_i;
2389
+ (void) i02;
2390
+ (void) i1;
2391
+ }
2392
+
2393
+ inline void ggml_cuda_op_norm(
2394
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2395
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2396
+ cudaStream_t & cudaStream_main){
2397
+
2398
+ GGML_ASSERT(src0_ddf_i != nullptr);
2399
+ GGML_ASSERT(dst_ddf_i != nullptr);
2400
+
2401
+ const int64_t ne00 = src0->ne[0];
2402
+ const int64_t i01_diff = i01_high - i01_low;
2403
+
2404
+ // compute
2405
+ norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2027
2406
 
2028
2407
  (void) src1;
2029
2408
  (void) dst;
@@ -2046,7 +2425,6 @@ inline void ggml_cuda_op_rms_norm(
2046
2425
 
2047
2426
  // compute
2048
2427
  rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2049
- CUDA_CHECK(cudaGetLastError());
2050
2428
 
2051
2429
  (void) src1;
2052
2430
  (void) dst;
@@ -2056,7 +2434,7 @@ inline void ggml_cuda_op_rms_norm(
2056
2434
  (void) i1;
2057
2435
  }
2058
2436
 
2059
- inline void ggml_cuda_op_dequantize_mul_mat_vec(
2437
+ inline void ggml_cuda_op_mul_mat_vec(
2060
2438
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2061
2439
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2062
2440
  cudaStream_t & cudaStream_main){
@@ -2068,70 +2446,115 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
2068
2446
  const int64_t ne00 = src0->ne[0];
2069
2447
  const int64_t nrows = i01_high - i01_low;
2070
2448
 
2071
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
2072
- #ifdef GGML_CUDA_DMMV_F16
2073
- size_t ash;
2074
- dfloat * src1_dfloat = nullptr; // dfloat == half
2449
+ #ifdef GGML_CUDA_FORCE_DMMV
2450
+ const bool use_mul_mat_vec_q = false;
2451
+ #else
2452
+ int id;
2453
+ CUDA_CHECK(cudaGetDevice(&id));
2075
2454
 
2076
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
2077
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
2078
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
2455
+ const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
2456
+ src0->type == GGML_TYPE_Q4_1 ||
2457
+ src0->type == GGML_TYPE_Q5_0 ||
2458
+ src0->type == GGML_TYPE_Q5_1 ||
2459
+ src0->type == GGML_TYPE_Q8_0;
2079
2460
 
2080
- if (src1_convert_f16) {
2081
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
2082
- ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
2083
- ne00, 1, sizeof(float), 0, 0,
2084
- ne00, 1, sizeof(half), 0, 0, cudaStream_main);
2085
- }
2461
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
2462
+ #endif
2463
+
2464
+ if (use_mul_mat_vec_q) {
2465
+ int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
2466
+ padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
2467
+ size_t as;
2468
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
2469
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
2470
+
2471
+ switch (src0->type) {
2472
+ case GGML_TYPE_Q4_0:
2473
+ mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2474
+ break;
2475
+ case GGML_TYPE_Q4_1:
2476
+ mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2477
+ break;
2478
+ case GGML_TYPE_Q5_0:
2479
+ mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2480
+ break;
2481
+ case GGML_TYPE_Q5_1:
2482
+ mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2483
+ break;
2484
+ case GGML_TYPE_Q8_0:
2485
+ mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2486
+ break;
2487
+ default:
2488
+ GGML_ASSERT(false);
2489
+ break;
2490
+ }
2491
+
2492
+ ggml_cuda_pool_free(src1_q8_1, as);
2493
+ } else {
2494
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
2495
+ #ifdef GGML_CUDA_DMMV_F16
2496
+ size_t ash;
2497
+ dfloat * src1_dfloat = nullptr; // dfloat == half
2498
+
2499
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
2500
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
2501
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
2502
+
2503
+ if (src1_convert_f16) {
2504
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
2505
+ ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
2506
+ ne00, 1, sizeof(float), 0, 0,
2507
+ ne00, 1, sizeof(half), 0, 0, cudaStream_main);
2508
+ }
2086
2509
  #else
2087
- dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
2510
+ dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
2088
2511
  #endif // GGML_CUDA_DMMV_F16
2089
2512
 
2090
- switch (src0->type) {
2091
- case GGML_TYPE_Q4_0:
2092
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2093
- break;
2094
- case GGML_TYPE_Q4_1:
2095
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2096
- break;
2097
- case GGML_TYPE_Q5_0:
2098
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2099
- break;
2100
- case GGML_TYPE_Q5_1:
2101
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2102
- break;
2103
- case GGML_TYPE_Q8_0:
2104
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2105
- break;
2106
- case GGML_TYPE_Q2_K:
2107
- dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2108
- break;
2109
- case GGML_TYPE_Q3_K:
2110
- dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2111
- break;
2112
- case GGML_TYPE_Q4_K:
2113
- dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2114
- break;
2115
- case GGML_TYPE_Q5_K:
2116
- dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2117
- break;
2118
- case GGML_TYPE_Q6_K:
2119
- dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2120
- break;
2121
- case GGML_TYPE_F16:
2122
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2123
- break;
2124
- default:
2125
- GGML_ASSERT(false);
2126
- break;
2127
- }
2128
- CUDA_CHECK(cudaGetLastError());
2513
+ switch (src0->type) {
2514
+ case GGML_TYPE_Q4_0:
2515
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2516
+ break;
2517
+ case GGML_TYPE_Q4_1:
2518
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2519
+ break;
2520
+ case GGML_TYPE_Q5_0:
2521
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2522
+ break;
2523
+ case GGML_TYPE_Q5_1:
2524
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2525
+ break;
2526
+ case GGML_TYPE_Q8_0:
2527
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2528
+ break;
2529
+ case GGML_TYPE_Q2_K:
2530
+ dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2531
+ break;
2532
+ case GGML_TYPE_Q3_K:
2533
+ dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2534
+ break;
2535
+ case GGML_TYPE_Q4_K:
2536
+ dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2537
+ break;
2538
+ case GGML_TYPE_Q5_K:
2539
+ dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2540
+ break;
2541
+ case GGML_TYPE_Q6_K:
2542
+ dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
2543
+ break;
2544
+ case GGML_TYPE_F16:
2545
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
2546
+ break;
2547
+ default:
2548
+ GGML_ASSERT(false);
2549
+ break;
2550
+ }
2129
2551
 
2130
2552
  #ifdef GGML_CUDA_DMMV_F16
2131
- if (src1_convert_f16) {
2132
- ggml_cuda_pool_free(src1_dfloat, ash);
2133
- }
2553
+ if (src1_convert_f16) {
2554
+ ggml_cuda_pool_free(src1_dfloat, ash);
2555
+ }
2134
2556
  #endif // GGML_CUDA_DMMV_F16
2557
+ }
2135
2558
 
2136
2559
  (void) src1;
2137
2560
  (void) dst;
@@ -2202,7 +2625,6 @@ inline void ggml_cuda_op_rope(
2202
2625
 
2203
2626
  // compute
2204
2627
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
2205
- CUDA_CHECK(cudaGetLastError());
2206
2628
 
2207
2629
  (void) dst;
2208
2630
  (void) src0_ddq_i;
@@ -2226,7 +2648,6 @@ inline void ggml_cuda_op_diag_mask_inf(
2226
2648
 
2227
2649
  // compute
2228
2650
  diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
2229
- CUDA_CHECK(cudaGetLastError());
2230
2651
 
2231
2652
  (void) dst;
2232
2653
  (void) src0_ddq_i;
@@ -2248,7 +2669,6 @@ inline void ggml_cuda_op_soft_max(
2248
2669
 
2249
2670
  // compute
2250
2671
  soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2251
- CUDA_CHECK(cudaGetLastError());
2252
2672
 
2253
2673
  (void) src1;
2254
2674
  (void) dst;
@@ -2344,10 +2764,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2344
2764
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
2345
2765
  size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
2346
2766
 
2347
- // if multiple GPUs are used they need to wait for the main GPU to finish
2767
+ // if multiple devices are used they need to wait for the main device
2768
+ // here an event is recorded that signifies that the main device has finished calculating the input data
2348
2769
  if (split && g_device_count > 1) {
2349
2770
  CUDA_CHECK(cudaSetDevice(g_main_device));
2350
- CUDA_CHECK(cudaDeviceSynchronize());
2771
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
2351
2772
  }
2352
2773
 
2353
2774
  for (int id = 0; id < g_device_count; ++id) {
@@ -2373,6 +2794,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2373
2794
  int64_t row_diff = row_high - row_low;
2374
2795
 
2375
2796
  cudaSetDevice(id);
2797
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id];
2798
+
2799
+ // wait for main GPU data if necessary
2800
+ if (split && id != g_main_device) {
2801
+ CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
2802
+ }
2376
2803
 
2377
2804
  if (src0_on_device && src0_is_contiguous) {
2378
2805
  if (src0_is_f32) {
@@ -2448,8 +2875,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2448
2875
  }
2449
2876
  const int64_t i11 = i13*ne12 + i12;
2450
2877
 
2451
- cudaStream_t cudaStream_main = g_cudaStreams_main[id];
2452
-
2453
2878
  // for split tensors the data begins at i0 == i0_offset_low
2454
2879
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
2455
2880
  float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2509,6 +2934,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2509
2934
 
2510
2935
  // do the computation
2511
2936
  op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
2937
+ CUDA_CHECK(cudaGetLastError());
2512
2938
 
2513
2939
  // copy dst to host or other device if necessary
2514
2940
  if (!dst_on_device) {
@@ -2538,6 +2964,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2538
2964
  CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
2539
2965
  }
2540
2966
  }
2967
+
2968
+ // signify to main device that other device is done
2969
+ if (split && g_device_count > 1 && id != g_main_device) {
2970
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
2971
+ }
2541
2972
  }
2542
2973
  }
2543
2974
  }
@@ -2549,7 +2980,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2549
2980
  }
2550
2981
 
2551
2982
  CUDA_CHECK(cudaSetDevice(id));
2552
- CUDA_CHECK(cudaDeviceSynchronize());
2553
2983
 
2554
2984
  if (src0_asq[id] > 0) {
2555
2985
  ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2564,6 +2994,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2564
2994
  ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
2565
2995
  }
2566
2996
  }
2997
+
2998
+ // main device waits for all other devices to be finished
2999
+ if (split && g_device_count > 1) {
3000
+ CUDA_CHECK(cudaSetDevice(g_main_device));
3001
+ for (int id = 0; id < g_device_count; ++id) {
3002
+ if (id != g_main_device) {
3003
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
3004
+ }
3005
+ }
3006
+ }
3007
+
3008
+ if (dst->backend == GGML_BACKEND_CPU) {
3009
+ CUDA_CHECK(cudaSetDevice(g_main_device));
3010
+ CUDA_CHECK(cudaDeviceSynchronize());
3011
+ }
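Taken together, the new event logic replaces the old cudaDeviceSynchronize() barriers with a GPU-side pipeline: the main device records an event once the input data is ready, the other devices make their streams wait on it, record their own event when their slice is done, and the main device's stream waits on those; the host only blocks when the result has to land in CPU memory. A hedged, self-contained sketch of the record/wait primitive (stream names are illustrative, error checks omitted):

    cudaEvent_t ready;
    cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
    cudaEventRecord(ready, producer_stream);          // queued after the producer's kernels
    cudaStreamWaitEvent(consumer_stream, ready, 0);   // consumer stalls on the GPU, the host keeps going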
2567
3012
  }
2568
3013
 
2569
3014
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2582,11 +3027,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2582
3027
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
2583
3028
  }
2584
3029
 
3030
+ void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3031
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3032
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
3033
+ }
3034
+
2585
3035
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2586
3036
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2587
3037
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
2588
3038
  }
2589
3039
 
3040
+ void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3041
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3042
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
3043
+ }
3044
+
2590
3045
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2591
3046
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2592
3047
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
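The ggml_cuda_gelu and ggml_cuda_norm wrappers added above only route GGML_OP_GELU and GGML_OP_NORM through ggml_cuda_op; the element-wise kernel behind the GELU path is not shown in this section. For orientation, a kernel based on the widely used tanh approximation, GELU(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))), could look like the following sketch (illustrative only, not the kernel introduced elsewhere in this diff):

    // element-wise GELU, tanh approximation; launch with enough threads to cover k
    __global__ void gelu_f32(const float * x, float * dst, const int k) {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        const float xi = x[i];
        dst[i] = 0.5f * xi * (1.0f + tanhf(0.7978845608f * (xi + 0.044715f * xi * xi * xi)));
    }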
@@ -2679,8 +3134,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
2679
3134
  }else if (src0->type == GGML_TYPE_F32) {
2680
3135
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
2681
3136
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
2682
- if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
2683
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
3137
+ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
3138
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
2684
3139
  } else {
2685
3140
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
2686
3141
  }
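The matrix multiplication dispatch above changes in two ways: the requirement that src0->ne[1] be a multiple of GGML_CUDA_DMMV_Y is dropped, and the single-column case (src1->ne[1] == 1, i.e. matrix times vector) now goes through the renamed ggml_cuda_op_mul_mat_vec, with everything else still falling back to the cuBLAS path. A self-contained predicate that mirrors the new condition (function and parameter names are mine, not from the source; dmmv_x stands in for GGML_CUDA_DMMV_X):

    #include <cstdint>

    static bool use_mul_mat_vec(int64_t src0_ne0, int64_t src1_ne1, int64_t dmmv_x) {
        const bool single_column = src1_ne1 == 1;            // src1 is effectively a vector
        const bool aligned_rows  = src0_ne0 % dmmv_x == 0;   // rows line up with the kernel's chunk width
        return single_column && aligned_rows;
    }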
@@ -2765,7 +3220,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2765
3220
 
2766
3221
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
2767
3222
  int nrows = ggml_nrows(tensor);
3223
+
3224
+ const int64_t ne0 = tensor->ne[0];
3225
+
2768
3226
  const size_t nb1 = tensor->nb[1];
3227
+
2769
3228
  ggml_backend backend = tensor->backend;
2770
3229
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
2771
3230
  memset(extra, 0, sizeof(*extra));
@@ -2794,34 +3253,54 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
2794
3253
  int64_t nrows_split = row_high - row_low;
2795
3254
 
2796
3255
  const size_t offset_split = row_low*nb1;
2797
- const size_t size = ggml_nbytes_split(tensor, nrows_split);
3256
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
3257
+ const size_t original_size = size;
3258
+
3259
+ // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
3260
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
3261
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
3262
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
3263
+ }
2798
3264
 
2799
- void * buf;
3265
+ char * buf;
2800
3266
  CUDA_CHECK(cudaMalloc(&buf, size));
2801
- void * buf_host = (char*)data + offset_split;
3267
+ char * buf_host = (char*)data + offset_split;
3268
+
3269
+ // set padding to 0 to avoid possible NaN values
3270
+ if (size > original_size) {
3271
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
3272
+ }
3273
+
2802
3274
 
2803
- cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
3275
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
2804
3276
 
2805
3277
  extra->data_device[id] = buf;
3278
+
3279
+ if (backend == GGML_BACKEND_GPU_SPLIT) {
3280
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
3281
+ }
2806
3282
  }
2807
3283
 
2808
3284
  tensor->extra = extra;
2809
3285
  }
2810
3286
 
2811
3287
  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
2812
- if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
3288
+ if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
2813
3289
  return;
2814
3290
  }
2815
3291
 
2816
3292
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
2817
3293
 
2818
3294
  for (int id = 0; id < g_device_count; ++id) {
2819
- if (extra->data_device[id] == nullptr) {
2820
- continue;
3295
+ if (extra->data_device[id] != nullptr) {
3296
+ CUDA_CHECK(cudaSetDevice(id));
3297
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
2821
3298
  }
2822
3299
 
2823
- CUDA_CHECK(cudaSetDevice(id));
2824
- CUDA_CHECK(cudaFree(extra->data_device[id]));
3300
+ if (extra->events[id] != nullptr) {
3301
+ CUDA_CHECK(cudaSetDevice(id));
3302
+ CUDA_CHECK(cudaEventDestroy(extra->events[id]));
3303
+ }
2825
3304
  }
2826
3305
 
2827
3306
  delete extra;
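The padding added to ggml_cuda_transform_tensor rounds the last row up to a multiple of MATRIX_ROW_PADDING elements (256, per the comment), converts that element count to bytes via ggml_type_size(type)/ggml_blck_size(type), and zero-fills the extra bytes so kernels that read whole padded rows never pick up NaNs. A worked example of the size arithmetic, using assumed values (a q4_0-style type with 18-byte blocks of 32 elements and a 4000-element row, both illustrative):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t padding    = 256;  // assumed MATRIX_ROW_PADDING
        const size_t type_size  = 18;   // bytes per q4_0 block: 2 (fp16 delta) + 32/2 (nibbles)
        const size_t block_size = 32;   // elements per block
        const size_t ne0        = 4000; // row length in elements (illustrative)

        size_t size = ne0 / block_size * type_size;   // unpadded size of one row: 2250 bytes
        const size_t original_size = size;

        if (ne0 % padding != 0) {
            size += (padding - ne0 % padding) * type_size / block_size;
        }
        // 4000 % 256 == 160, so 96 extra elements -> 96 * 18 / 32 == 54 extra bytes,
        // which is exactly the region the cudaMemset above zero-fills
        printf("original %zu, padded %zu, zeroed padding %zu bytes\n",
               original_size, size, size - original_size);
        return 0;
    }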
@@ -2833,36 +3312,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
2833
3312
  }
2834
3313
 
2835
3314
  // recursively assign CUDA buffers until a compute tensor is found
2836
- if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
2837
- const ggml_op src0_op = tensor->src0->op;
3315
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
3316
+ const ggml_op src0_op = tensor->src[0]->op;
2838
3317
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
2839
- ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
3318
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
2840
3319
  }
2841
3320
  }
2842
- if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
2843
- ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
3321
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
3322
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
2844
3323
  }
2845
3324
 
2846
3325
  tensor->backend = GGML_BACKEND_GPU;
2847
3326
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
2848
3327
  memset(extra, 0, sizeof(*extra));
2849
3328
 
2850
- const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
3329
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
2851
3330
  tensor->op == GGML_OP_VIEW ||
2852
3331
  force_inplace;
2853
3332
  const size_t size = ggml_nbytes(tensor);
2854
3333
 
2855
3334
  CUDA_CHECK(cudaSetDevice(g_main_device));
2856
- if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
2857
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
3335
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
3336
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
2858
3337
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2859
3338
  size_t offset = 0;
2860
3339
  if (tensor->op == GGML_OP_VIEW) {
2861
- memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
3340
+ memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
2862
3341
  }
2863
3342
  extra->data_device[g_main_device] = src0_ddc + offset;
2864
3343
  } else if (tensor->op == GGML_OP_CPY) {
2865
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
3344
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
2866
3345
  void * src1_ddv = src1_extra->data_device[g_main_device];
2867
3346
  extra->data_device[g_main_device] = src1_ddv;
2868
3347
  } else if (scratch) {
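The mechanical tensor->src0 / tensor->src1 / tensor->opt[0] to tensor->src[0] / src[1] / src[2] renames in this hunk track the upstream ggml change that folded a tensor's operands into a single src array; note in particular that the byte offset of a GGML_OP_VIEW now arrives via src[2]->data. A small sketch of the new accessors (the helper name is mine, and ggml.h is assumed to be on the include path):

    #include <cstddef>   // size_t
    #include <cstring>   // memcpy
    #include "ggml.h"    // assumed available; provides ggml_tensor and GGML_OP_VIEW

    // old field          new field
    // tensor->src0   ->  tensor->src[0]
    // tensor->src1   ->  tensor->src[1]
    // tensor->opt[0] ->  tensor->src[2]   (carries the view offset for GGML_OP_VIEW)

    static size_t view_offset(const struct ggml_tensor * tensor) {
        size_t offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
            // the byte offset of a view is stored as raw bytes in an extra source tensor
            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
        }
        return offset;
    }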
@@ -2933,8 +3412,8 @@ void ggml_cuda_free_scratch() {
2933
3412
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
2934
3413
  ggml_cuda_func_t func;
2935
3414
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
2936
- || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
2937
- || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
3415
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
3416
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
2938
3417
 
2939
3418
  switch (tensor->op) {
2940
3419
  case GGML_OP_ADD:
@@ -2949,12 +3428,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
2949
3428
  }
2950
3429
  func = ggml_cuda_mul;
2951
3430
  break;
3431
+ case GGML_OP_GELU:
3432
+ if (!any_on_device) {
3433
+ return false;
3434
+ }
3435
+ func = ggml_cuda_gelu;
3436
+ break;
2952
3437
  case GGML_OP_SILU:
2953
3438
  if (!any_on_device) {
2954
3439
  return false;
2955
3440
  }
2956
3441
  func = ggml_cuda_silu;
2957
3442
  break;
3443
+ case GGML_OP_NORM:
3444
+ if (!any_on_device) {
3445
+ return false;
3446
+ }
3447
+ func = ggml_cuda_norm;
3448
+ break;
2958
3449
  case GGML_OP_RMS_NORM:
2959
3450
  if (!any_on_device) {
2960
3451
  return false;
@@ -2962,7 +3453,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
2962
3453
  func = ggml_cuda_rms_norm;
2963
3454
  break;
2964
3455
  case GGML_OP_MUL_MAT:
2965
- if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
3456
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
2966
3457
  return false;
2967
3458
  }
2968
3459
  func = ggml_cuda_mul_mat;
@@ -3016,6 +3507,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
3016
3507
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
3017
3508
  return true;
3018
3509
  }
3019
- func(tensor->src0, tensor->src1, tensor);
3510
+ func(tensor->src[0], tensor->src[1], tensor);
3020
3511
  return true;
3021
3512
  }