llama_cpp 0.3.2 → 0.3.3

@@ -59,8 +59,8 @@ typedef float2 dfloat2;
  #endif //GGML_CUDA_DMMV_F16

  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
- typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
- typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
  typedef void (*ggml_cuda_op_t)(
@@ -131,7 +131,7 @@ typedef struct {
  } block_q8_1;
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");

- typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);

  //================================= k-quants

@@ -208,9 +208,11 @@ typedef struct {
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

  #define WARP_SIZE 32
+ #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

  #define CUDA_ADD_BLOCK_SIZE 256
  #define CUDA_MUL_BLOCK_SIZE 256
+ #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
@@ -265,6 +267,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
  dst[i] = x[i] * y[i%ky];
  }

+ static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+ const float GELU_COEF_A = 0.044715f;
+ const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+
+ float xi = x[i];
+ dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+ }
+
  static __global__ void silu_f32(const float * x, float * dst, const int k) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -274,16 +289,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] / (1.0f + expf(-x[i]));
  }

+ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
+ const int tid = threadIdx.x;
+
+ const float eps = 1e-5f;
+
+ float mean = 0.0f;
+ float var = 0.0f;
+
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
+ const float xi = x[row*ncols + col];
+ mean += xi;
+ var += xi * xi;
+ }
+
+ // sum up partial sums
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+ var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+ }
+
+ mean /= ncols;
+ var = var / ncols - mean * mean;
+ const float inv_var = rsqrtf(var + eps);
+
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
+ dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+ }
+ }
+
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

- const float eps = 1e-6;
+ const float eps = 1e-6f;

  float tmp = 0.0f; // partial sum for thread in warp

- for (int i = 0; i < ncols; i += WARP_SIZE) {
- const int col = i + tid;
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
  const float xi = x[row*ncols + col];
  tmp += xi * xi;
  }
@@ -295,10 +340,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
  }

  const float mean = tmp / ncols;
- const float scale = 1.0f / sqrtf(mean + eps);
+ const float scale = rsqrtf(mean + eps);

- for (int i = 0; i < ncols; i += WARP_SIZE) {
- const int col = i + tid;
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
  dst[row*ncols + col] = scale * x[row*ncols + col];
  }
  }
@@ -407,7 +451,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

  //================================== k-quants

- static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {

  const int i = blockIdx.x;
  const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +484,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

  }

- static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {

  const int i = blockIdx.x;
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +548,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
  }
  #endif

- static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
  const block_q4_K * x = (const block_q4_K *) vx;

  const int i = blockIdx.x;
@@ -544,7 +588,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
  #endif
  }

- static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
  const block_q5_K * x = (const block_q5_K *) vx;

  const int i = blockIdx.x;
@@ -590,7 +634,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
  #endif
  }

- static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
  const block_q6_K * x = (const block_q6_K *) vx;

  const int i = blockIdx.x;
@@ -634,7 +678,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
  #endif
  }

- static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

@@ -742,7 +786,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
  }
  }

- static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

  const int row = blockIdx.y*blockDim.y + threadIdx.y;
  if (row > nrows) return;
@@ -846,7 +890,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
  }
  }

- static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

  const int row = blockIdx.y*blockDim.y + threadIdx.y;
  if (row > nrows) return;
@@ -949,7 +993,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
  }
  }

- static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {

  const int row = blockIdx.x;
  const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1097,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
  }
  }

- static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

@@ -1171,7 +1215,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
  v.y = x[ib + iqs + 1];
  }

- static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

  if (i >= k) {
@@ -1180,10 +1224,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {

  block_q8_1 * y = (block_q8_1 *) vy;

- const int ib = i / QK8_0; // block index
- const int iqs = i % QK8_0; // quant index
+ const int ib = i / QK8_1; // block index
+ const int iqs = i % QK8_1; // quant index

- const float xi = x[i];
+ const float xi = i < ndata ? x[i] : 0.0f;
  float amax = fabsf(xi);
  float sum = xi;

@@ -1207,7 +1251,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
  }

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
- static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+ static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

  if (i >= k) {
@@ -1227,8 +1271,8 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
  y[iybs + iqs + y_offset] = v.y;
  }

- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
  const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

  int vi;
@@ -1249,11 +1293,11 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
  return sumi*d;
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 600
+ #endif // __CUDA_ARCH__ >= 610
  }

- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
  const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

  const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1274,11 +1318,11 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
  return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 600
+ #endif // __CUDA_ARCH__ >= 610
  }

- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
  const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

  int qs;
@@ -1309,11 +1353,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
  return sumi*d;
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 600
+ #endif // __CUDA_ARCH__ >= 610
  }

- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
  const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

  const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1343,11 +1387,11 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
  return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 600
+ #endif // __CUDA_ARCH__ >= 610
  }

- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
  const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

  int vi;
@@ -1362,11 +1406,11 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
  return sumi*d;
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 600
+ #endif // __CUDA_ARCH__ >= 610
  }

  template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
- static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
  const int row = blockIdx.y*blockDim.y + threadIdx.y;

  if (row >= nrows) {
@@ -1404,7 +1448,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
  }

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
- static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
  // qk = quantized weights per x block
  // qr = number of quantized weights per data value in x block
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1515,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
  }
  }

- static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
  const half * x = (const half *) vx;

  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1562,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
  }

  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
  const int row_stride_x, const int channel_stride_x) {

  const half * x = (const half *) vx;
@@ -1703,20 +1747,31 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
  }

+ static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+ gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
  static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
  }

+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % WARP_SIZE == 0);
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ }
+
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
  const dim3 block_dims(WARP_SIZE, 1, 1);
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
  }

- static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
- quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+ quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
  }

  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2236,16 +2291,21 @@ inline void ggml_cuda_op_add(

  GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
  GGML_ASSERT(src1_ddf_i != nullptr);
- GGML_ASSERT(dst_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);

- const int64_t ne0 = src0->ne[0];
+ // TODO: support broadcasting
+ GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
+
+ const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

+ // const int64_t ne10 = src1->ne[0];
+
  // compute
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
  } else {
  GGML_ASSERT(false);
  }
@@ -2264,10 +2324,9 @@ inline void ggml_cuda_op_mul(

  GGML_ASSERT(src0_ddf_i != nullptr);
  GGML_ASSERT(src1_ddf_i != nullptr);
- GGML_ASSERT(dst_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);

  const int64_t ne00 = src0->ne[0];
-
  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];

@@ -2276,7 +2335,7 @@ inline void ggml_cuda_op_mul(

  float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
  float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
- float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
+ float * dst_ddf_i01 = dst_ddf_i + i01*ne00;

  // compute
  mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
@@ -2287,6 +2346,28 @@ inline void ggml_cuda_op_mul(
  (void) i02;
  }

+ inline void ggml_cuda_op_gelu(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ // compute
+ gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+
+ (void) src1;
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i02;
+ (void) i1;
+ }
+
  inline void ggml_cuda_op_silu(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2309,6 +2390,28 @@ inline void ggml_cuda_op_silu(
  (void) i1;
  }

+ inline void ggml_cuda_op_norm(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ // compute
+ norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+
+ (void) src1;
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i02;
+ (void) i1;
+ }
+
  inline void ggml_cuda_op_rms_norm(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2355,16 +2458,15 @@ inline void ggml_cuda_op_mul_mat_vec(
  src0->type == GGML_TYPE_Q5_1 ||
  src0->type == GGML_TYPE_Q8_0;

- // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
- // However, they have bad performance with Pascal cards.
- // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
  #endif

  if (use_mul_mat_vec_q) {
+ int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+ padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
  size_t as;
- void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);

  switch (src0->type) {
  case GGML_TYPE_Q4_0:
@@ -2925,11 +3027,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
  }

+ void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+ }
+
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
  }

+ void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+ }
+
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3108,7 +3220,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens

  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  int nrows = ggml_nrows(tensor);
+
+ const int64_t ne0 = tensor->ne[0];
+
  const size_t nb1 = tensor->nb[1];
+
  ggml_backend backend = tensor->backend;
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));
@@ -3137,13 +3253,26 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  int64_t nrows_split = row_high - row_low;

  const size_t offset_split = row_low*nb1;
- const size_t size = ggml_nbytes_split(tensor, nrows_split);
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+ }

- void * buf;
+ char * buf;
  CUDA_CHECK(cudaMalloc(&buf, size));
- void * buf_host = (char*)data + offset_split;
+ char * buf_host = (char*)data + offset_split;
+
+ // set padding to 0 to avoid possible NaN values
+ if (size > original_size) {
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+ }

- cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));

  extra->data_device[id] = buf;

@@ -3183,36 +3312,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  }

  // recursively assign CUDA buffers until a compute tensor is found
- if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
- const ggml_op src0_op = tensor->src0->op;
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+ const ggml_op src0_op = tensor->src[0]->op;
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
- ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
  }
  }
- if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
  }

  tensor->backend = GGML_BACKEND_GPU;
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));

- const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW ||
  force_inplace;
  const size_t size = ggml_nbytes(tensor);

  CUDA_CHECK(cudaSetDevice(g_main_device));
- if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
+ memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
  }
  extra->data_device[g_main_device] = src0_ddc + offset;
  } else if (tensor->op == GGML_OP_CPY) {
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
  void * src1_ddv = src1_extra->data_device[g_main_device];
  extra->data_device[g_main_device] = src1_ddv;
  } else if (scratch) {
@@ -3283,8 +3412,8 @@ void ggml_cuda_free_scratch() {
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
- || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
- || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

  switch (tensor->op) {
  case GGML_OP_ADD:
@@ -3299,12 +3428,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_mul;
  break;
+ case GGML_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_gelu;
+ break;
  case GGML_OP_SILU:
  if (!any_on_device) {
  return false;
  }
  func = ggml_cuda_silu;
  break;
+ case GGML_OP_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_norm;
+ break;
  case GGML_OP_RMS_NORM:
  if (!any_on_device) {
  return false;
@@ -3312,7 +3453,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  func = ggml_cuda_rms_norm;
  break;
  case GGML_OP_MUL_MAT:
- if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
  return false;
  }
  func = ggml_cuda_mul_mat;
@@ -3366,6 +3507,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return true;
  }
- func(tensor->src0, tensor->src1, tensor);
+ func(tensor->src[0], tensor->src[1], tensor);
  return true;
  }