llama_cpp 0.3.2 → 0.3.3

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -59,8 +59,8 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_DMMV_F16
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
@@ -131,7 +131,7 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
 
-typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
 
 //================================= k-quants
 
@@ -208,9 +208,11 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -265,6 +267,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }
 
+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
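
Note: the gelu_f32 kernel added above applies the tanh-based GELU approximation elementwise, one thread per element, with the i >= k guard discarding threads past the end of the tensor. For reference, a minimal host-side sketch of the same formula is shown below (illustration only; gelu_ref is a hypothetical helper, not part of the package):

    // Reference evaluation of the same tanh-based GELU approximation, for comparison against the kernel.
    #include <cmath>

    static float gelu_ref(float x) {
        const float GELU_COEF_A    = 0.044715f;
        const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
        return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
    }
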
@@ -274,16 +289,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6;
+    const float eps = 1e-6f;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
@@ -295,10 +340,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     const float mean = tmp / ncols;
-    const float scale = 1.0f / sqrtf(mean + eps);
+    const float scale = rsqrtf(mean + eps);
 
-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
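
Note: both the new norm_f32 kernel and the reworked rms_norm_f32 process one row per warp and combine the per-thread partial sums with a butterfly reduction over __shfl_xor_sync. A minimal sketch of that reduction pattern in isolation (illustration only; warp_reduce_sum is a hypothetical helper, not part of the package):

    // Butterfly reduction across the 32 lanes of a warp.
    // After the loop, every lane holds the sum of all 32 lanes' inputs.
    static __device__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);
        }
        return x;
    }
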
@@ -407,7 +451,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 
 //================================== k-quants
 
-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +484,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
 
 }
 
-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
 
     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +548,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif
 
-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;
 
     const int i = blockIdx.x;
@@ -544,7 +588,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;
 
     const int i = blockIdx.x;
@@ -590,7 +634,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int i = blockIdx.x;
@@ -634,7 +678,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
 #endif
 }
 
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -742,7 +786,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -846,7 +890,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -949,7 +993,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
 
     const int row = blockIdx.x;
     const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1097,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
     }
 }
 
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
@@ -1171,7 +1215,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -1180,10 +1224,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
 
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib = i / QK8_0; // block index
-    const int iqs = i % QK8_0; // quant index
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index
 
-    const float xi = x[i];
+    const float xi = i < ndata ? x[i] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
 
@@ -1207,7 +1251,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
     if (i >= k) {
@@ -1227,8 +1271,8 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int vi;
@@ -1249,11 +1293,11 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= 610
 }
 
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1274,11 +1318,11 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= 610
 }
 
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
     int qs;
@@ -1309,11 +1353,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= 610
 }
 
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1343,11 +1387,11 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= 610
 }
 
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
     int vi;
@@ -1362,11 +1406,11 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= 610
 }
 
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
@@ -1404,7 +1448,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1515,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 }
 
-static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1562,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 }
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
     const int row_stride_x, const int channel_stride_x) {
 
     const half * x = (const half *) vx;
@@ -1703,20 +1747,31 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
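
Note: the new gelu_f32_cuda and norm_f32_cuda launchers follow the existing pattern: the grid size is the ceiling of k divided by the block size, so every element gets a thread and the kernel's i >= k guard discards the overshoot. A minimal sketch of that grid-size arithmetic (illustration only; num_blocks_for is a hypothetical helper, not part of the package):

    // Ceiling division used to size a 1-D grid: enough blocks to cover k elements.
    static int num_blocks_for(int k, int block_size) {
        return (k + block_size - 1) / block_size;  // e.g. k = 1000, block_size = 256 -> 4 blocks (1024 threads)
    }
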
@@ -2236,16 +2291,21 @@ inline void ggml_cuda_op_add(
 
     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
 
-    const int64_t ne0 = src0->ne[0];
+    // TODO: support broadcasting
+    GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
+
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    // const int64_t ne10 = src1->ne[0];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
@@ -2264,10 +2324,9 @@ inline void ggml_cuda_op_mul(
 
     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
@@ -2276,7 +2335,7 @@ inline void ggml_cuda_op_mul(
 
         float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
         float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
+        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
 
         // compute
         mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
@@ -2287,6 +2346,28 @@ inline void ggml_cuda_op_mul(
     (void) i02;
 }
 
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_silu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2309,6 +2390,28 @@ inline void ggml_cuda_op_silu(
     (void) i1;
 }
 
+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2355,16 +2458,15 @@ inline void ggml_cuda_op_mul_mat_vec(
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
 
-    // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
-    // However, they have bad performance with Pascal cards.
-    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
 #endif
 
     if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
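
Note: the two added lines above compute the smallest multiple of MATRIX_ROW_PADDING (256) that is at least ne00, so the temporary q8_1 buffer always holds whole blocks; quantize_row_q8_1_cuda then treats the elements beyond ne00 as zeros via the i < ndata guard added earlier. A minimal sketch of the round-up arithmetic (illustration only; round_up is a hypothetical helper, not part of the package):

    // Round n up to the next multiple of `padding` (same arithmetic as the padded_row_size computation above).
    static int64_t round_up(int64_t n, int64_t padding) {
        int64_t padded = n + padding - 1;
        padded -= padded % padding;
        return padded;  // e.g. round_up(4000, 256) == 4096
    }
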
@@ -2925,11 +3027,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }
 
+void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+}
+
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }
 
+void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+}
+
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3108,7 +3220,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -3137,13 +3253,26 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;
 
         const size_t offset_split = row_low*nb1;
-        const size_t size = ggml_nbytes_split(tensor, nrows_split);
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
 
-        void * buf;
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        void * buf_host = (char*)data + offset_split;
+        char * buf_host = (char*)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
 
-        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
 
         extra->data_device[id] = buf;
 
@@ -3183,36 +3312,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }
 
     // recursively assign CUDA buffers until a compute tensor is found
-    if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
-        const ggml_op src0_op = tensor->src0->op;
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
-    if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
-    const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
+            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
+        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
@@ -3283,8 +3412,8 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
     switch (tensor->op) {
         case GGML_OP_ADD:
@@ -3299,12 +3428,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
+        case GGML_OP_GELU:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_gelu;
+            break;
        case GGML_OP_SILU:
             if (!any_on_device) {
                 return false;
             }
             func = ggml_cuda_silu;
             break;
+        case GGML_OP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_norm;
+            break;
         case GGML_OP_RMS_NORM:
             if (!any_on_device) {
                 return false;
@@ -3312,7 +3453,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
             }
             func = ggml_cuda_mul_mat;
@@ -3366,6 +3507,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return true;
     }
-    func(tensor->src0, tensor->src1, tensor);
+    func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }