llama_cpp 0.3.2 → 0.3.4

@@ -13,6 +13,8 @@
  #include "ggml-cuda.h"
  #include "ggml.h"

+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
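For context, __dp4a(a, b, c) treats each 32-bit operand as four packed signed bytes and returns c plus their byte-wise dot product; it needs compute capability 6.1, which is what the new MIN_CC_DP4A constant encodes. A purely illustrative sketch of the guard pattern used later in this diff (the helper name is made up, not code from the gem):

// Illustrative only: hardware byte-wise dot product when available, manual fallback otherwise.
__device__ int dot4_sketch(int a, int b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610 // i.e. MIN_CC_DP4A
    return __dp4a(a, b, 0); // c + sum over the four int8 lane products
#else
    int sum = 0; // fallback: unpack the four int8 lanes manually
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (int8_t)(a >> (8*i));
        const int8_t bi = (int8_t)(b >> (8*i));
        sum += ai * bi;
    }
    return sum;
#endif
}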
@@ -59,8 +61,8 @@ typedef float2 dfloat2;
  #endif //GGML_CUDA_DMMV_F16

  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
- typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
- typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
  typedef void (*ggml_cuda_op_t)(
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(

  #define QK4_0 32
  #define QR4_0 2
- #define QI4_0 4
+ #define QI4_0 (QK4_0 / (4 * QR4_0))
  typedef struct {
  half d; // delta
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

  #define QK4_1 32
  #define QR4_1 2
- #define QI4_1 4
+ #define QI4_1 (QK4_1 / (4 * QR4_1))
  typedef struct {
  half d; // delta
  half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

  #define QK5_0 32
  #define QR5_0 2
- #define QI5_0 4
+ #define QI5_0 (QK5_0 / (4 * QR5_0))
  typedef struct {
  half d; // delta
  uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

  #define QK5_1 32
  #define QR5_1 2
- #define QI5_1 4
+ #define QI5_1 (QK5_1 / (4 * QR5_1))
  typedef struct {
  half d; // delta
  half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

  #define QK8_0 32
  #define QR8_0 1
- #define QI8_0 8
+ #define QI8_0 (QK8_0 / (4 * QR8_0))
  typedef struct {
  half d; // delta
  int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

  #define QK8_1 32
  #define QR8_1 1
- #define QI8_1 8
+ #define QI8_1 (QK8_1 / (4 * QR8_1))
  typedef struct {
  half d; // delta
  half s; // unquantized sum
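The hunks above replace hard-coded QI values with QK / (4 * QR), which works out to the number of 32-bit integers of low-bit quant data per block. A small, purely illustrative check (not code from the gem) that the new expressions reproduce the old literals:

// QK = quants per block, QR = packing ratio of the qs array; QI then counts
// the 32-bit ints of quant data per block, matching the old hard-coded values.
#define QK4_0 32
#define QR4_0 2
#define QI4_0 (QK4_0 / (4 * QR4_0))
#define QK8_0 32
#define QR8_0 1
#define QI8_0 (QK8_0 / (4 * QR8_0))

static_assert(QI4_0 == 4, "same as the previous literal");
static_assert(QI8_0 == 8, "same as the previous literal");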
@@ -131,7 +133,7 @@ typedef struct {
  } block_q8_1;
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");

- typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);

  //================================= k-quants

@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, co
  #define K_SCALE_SIZE 12
  #endif

+ #define QR2_K 4
+ #define QI2_K (QK_K / (4*QR2_K))
  typedef struct {
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
  uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
  } block_q2_K;
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

+ #define QR3_K 4
+ #define QI3_K (QK_K / (4*QR3_K))
  typedef struct {
  uint8_t hmask[QK_K/8]; // quants - high bit
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
  } block_q3_K;
  //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");

+ #define QR4_K 2
+ #define QI4_K (QK_K / (4*QR4_K))
  #ifdef GGML_QKK_64
  typedef struct {
  half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
  #endif

+ #define QR5_K 2
+ #define QI5_K (QK_K / (4*QR5_K))
  #ifdef GGML_QKK_64
  typedef struct {
  half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
  #endif

+ #define QR6_K 2
+ #define QI6_K (QK_K / (4*QR6_K))
  typedef struct {
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
  uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -208,9 +220,11 @@ typedef struct {
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

  #define WARP_SIZE 32
+ #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

  #define CUDA_ADD_BLOCK_SIZE 256
  #define CUDA_MUL_BLOCK_SIZE 256
+ #define CUDA_GELU_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
@@ -238,13 +252,13 @@ struct ggml_tensor_extra_gpu {
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
  };

- static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+ static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

- if (i >= k) {
+ if (i >= kx) {
  return;
  }
- dst[i] = x[i] + y[i];
+ dst[i] = x[i] + y[i%ky];
  }

  static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -265,6 +279,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
  dst[i] = x[i] * y[i%ky];
  }

+ static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+ const float GELU_COEF_A = 0.044715f;
+ const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+
+ float xi = x[i];
+ dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+ }
+
  static __global__ void silu_f32(const float * x, float * dst, const int k) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

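For reference, the new gelu_f32 kernel applies the usual tanh approximation of GELU element-wise. A purely illustrative host-side equivalent (helper name assumed, not part of the gem):

#include <math.h>

// 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))), the same approximation as gelu_f32.
static float gelu_ref(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}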
@@ -274,16 +301,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] / (1.0f + expf(-x[i]));
  }

+ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
+ const int tid = threadIdx.x;
+
+ const float eps = 1e-5f;
+
+ float mean = 0.0f;
+ float var = 0.0f;
+
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
+ const float xi = x[row*ncols + col];
+ mean += xi;
+ var += xi * xi;
+ }
+
+ // sum up partial sums
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+ var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+ }
+
+ mean /= ncols;
+ var = var / ncols - mean * mean;
+ const float inv_var = rsqrtf(var + eps);
+
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
+ dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+ }
+ }
+
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

- const float eps = 1e-6;
+ const float eps = 1e-6f;

  float tmp = 0.0f; // partial sum for thread in warp

- for (int i = 0; i < ncols; i += WARP_SIZE) {
- const int col = i + tid;
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
  const float xi = x[row*ncols + col];
  tmp += xi * xi;
  }
@@ -295,10 +352,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
  }

  const float mean = tmp / ncols;
- const float scale = 1.0f / sqrtf(mean + eps);
+ const float scale = rsqrtf(mean + eps);

- for (int i = 0; i < ncols; i += WARP_SIZE) {
- const int col = i + tid;
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
  dst[row*ncols + col] = scale * x[row*ncols + col];
  }
  }
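Both norm_f32 above and rms_norm_f32 combine per-thread partial sums with a butterfly reduction over one warp. As a standalone, purely illustrative sketch of that pattern (helper name assumed, not from the gem):

// After the loop every lane holds the sum of all 32 lanes' inputs; XOR-ing the
// lane id with 16, 8, 4, 2, 1 pairs lanes up at each step of the butterfly.
__device__ float warp_reduce_sum(float v) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        v += __shfl_xor_sync(0xffffffff, v, mask, 32);
    }
    return v;
}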
@@ -407,7 +463,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
407
463
 
408
464
  //================================== k-quants
409
465
 
410
- static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
466
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
411
467
 
412
468
  const int i = blockIdx.x;
413
469
  const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +496,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
440
496
 
441
497
  }
442
498
 
443
- static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
499
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
444
500
 
445
501
  const int i = blockIdx.x;
446
502
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +560,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
504
560
  }
505
561
  #endif
506
562
 
507
- static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
563
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
508
564
  const block_q4_K * x = (const block_q4_K *) vx;
509
565
 
510
566
  const int i = blockIdx.x;
@@ -544,7 +600,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
544
600
  #endif
545
601
  }
546
602
 
547
- static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
603
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
548
604
  const block_q5_K * x = (const block_q5_K *) vx;
549
605
 
550
606
  const int i = blockIdx.x;
@@ -590,7 +646,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
590
646
  #endif
591
647
  }
592
648
 
593
- static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
649
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
594
650
  const block_q6_K * x = (const block_q6_K *) vx;
595
651
 
596
652
  const int i = blockIdx.x;
@@ -634,7 +690,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
634
690
  #endif
635
691
  }
636
692
 
637
- static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
693
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
638
694
 
639
695
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
640
696
 
@@ -742,7 +798,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
742
798
  }
743
799
  }
744
800
 
745
- static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
801
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
746
802
 
747
803
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
748
804
  if (row > nrows) return;
@@ -846,7 +902,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
846
902
  }
847
903
  }
848
904
 
849
- static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
905
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
850
906
 
851
907
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
852
908
  if (row > nrows) return;
@@ -949,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
949
1005
  }
950
1006
  }
951
1007
 
952
- static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
1008
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
953
1009
 
954
1010
  const int row = blockIdx.x;
955
1011
  const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
1053
1109
  }
1054
1110
  }
1055
1111
 
1056
- static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
1112
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1057
1113
 
1058
1114
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
1059
1115
 
@@ -1171,7 +1227,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1171
1227
  v.y = x[ib + iqs + 1];
1172
1228
  }
1173
1229
 
1174
- static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
1230
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1175
1231
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
1176
1232
 
1177
1233
  if (i >= k) {
@@ -1180,10 +1236,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
1180
1236
 
1181
1237
  block_q8_1 * y = (block_q8_1 *) vy;
1182
1238
 
1183
- const int ib = i / QK8_0; // block index
1184
- const int iqs = i % QK8_0; // quant index
1239
+ const int ib = i / QK8_1; // block index
1240
+ const int iqs = i % QK8_1; // quant index
1185
1241
 
1186
- const float xi = x[i];
1242
+ const float xi = i < ndata ? x[i] : 0.0f;
1187
1243
  float amax = fabsf(xi);
1188
1244
  float sum = xi;
1189
1245
 
@@ -1207,7 +1263,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
1207
1263
  }
1208
1264
 
1209
1265
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1210
- static __global__ void dequantize_block(const void * vx, float * y, const int k) {
1266
+ static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
1211
1267
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
1212
1268
 
1213
1269
  if (i >= k) {
@@ -1227,8 +1283,9 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
1227
1283
  y[iybs + iqs + y_offset] = v.y;
1228
1284
  }
1229
1285
 
1230
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1231
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1286
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1287
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1288
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1232
1289
  const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1233
1290
 
1234
1291
  int vi;
@@ -1249,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
1249
1306
  return sumi*d;
1250
1307
  #else
1251
1308
  return 0.0f; // only to satisfy the compiler
1252
- #endif // __CUDA_ARCH__ >= 600
1309
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1253
1310
  }
1254
1311
 
1255
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1256
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1312
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1313
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1314
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1257
1315
  const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1258
1316
 
1259
1317
  const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1274,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
1274
1332
  return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1275
1333
  #else
1276
1334
  return 0.0f; // only to satisfy the compiler
1277
- #endif // __CUDA_ARCH__ >= 600
1335
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1278
1336
  }
1279
1337
 
1280
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1281
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1338
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1339
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1340
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1282
1341
  const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1283
1342
 
1284
1343
  int qs;
@@ -1309,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
1309
1368
  return sumi*d;
1310
1369
  #else
1311
1370
  return 0.0f; // only to satisfy the compiler
1312
- #endif // __CUDA_ARCH__ >= 600
1371
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1313
1372
  }
1314
1373
 
1315
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1316
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1374
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1375
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1376
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1317
1377
  const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1318
1378
 
1319
1379
  const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1343,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
1343
1403
  return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1344
1404
  #else
1345
1405
  return 0.0f; // only to satisfy the compiler
1346
- #endif // __CUDA_ARCH__ >= 600
1406
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1347
1407
  }
1348
1408
 
1349
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1350
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1409
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1410
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1411
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1351
1412
  const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1352
1413
 
1353
1414
  int vi;
@@ -1362,11 +1423,224 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
1362
1423
  return sumi*d;
1363
1424
  #else
1364
1425
  return 0.0f; // only to satisfy the compiler
1365
- #endif // __CUDA_ARCH__ >= 600
1426
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1427
+ }
1428
+
1429
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
1430
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1431
+
1432
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1433
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
1434
+
1435
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
1436
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1437
+
1438
+ float sumf_d = 0.0f;
1439
+ float sumf_m = 0.0f;
1440
+
1441
+ const float d = bq2_K->d;
1442
+ const float dmin = bq2_K->dmin;
1443
+
1444
+ const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
1445
+
1446
+ for (int i = 0; i < QR2_K; ++i) {
1447
+ const int sc = bq2_K->scales[scale_offset + 2*i];
1448
+
1449
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1450
+ const float d8i = bq8i->d;
1451
+
1452
+ const int vi = (v >> (2*i)) & 0x03030303;
1453
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1454
+
1455
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
1456
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
1457
+ }
1458
+
1459
+ return d*sumf_d - dmin*sumf_m;
1460
+ #else
1461
+ return 0.0f; // only to satisfy the compiler
1462
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1463
+ }
1464
+
1465
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
1466
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1467
+
1468
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1469
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
1470
+
1471
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
1472
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1473
+
1474
+ float sumf = 0.0f;
1475
+
1476
+ const float d = bq3_K->d;
1477
+
1478
+ int vl;
1479
+ memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
1480
+
1481
+ int vh;
1482
+ memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
1483
+ vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
1484
+ vh >>= bq8_offset;
1485
+
1486
+ for (int i = 0; i < QR3_K; ++i) {
1487
+ const int isc = scale_offset + 2*i;
1488
+
1489
+ const int isc_low = isc % (QK_K/32);
1490
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
1491
+ const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
1492
+
1493
+ const int isc_high = isc % (QK_K/64);
1494
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
1495
+ const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1496
+
1497
+ const int sc = (sc_low | sc_high) - 32;
1498
+
1499
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1500
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1501
+ const float d8i = bq8i->d;
1502
+
1503
+ const int vil = (vl >> (2*i)) & 0x03030303;
1504
+
1505
+ const int vih = ((vh >> i) << 2) & 0x04040404;
1506
+
1507
+ const int vi = __vsubss4(vil, vih);
1508
+
1509
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1510
+ }
1511
+
1512
+ return d*sumf;
1513
+ #else
1514
+ return 0.0f; // only to satisfy the compiler
1515
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1516
+ }
1517
+
1518
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1519
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1520
+
1521
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1522
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
1523
+
1524
+ const int bq8_offset = QR4_K * (iqs / QI8_1);
1525
+
1526
+ float sumf_d = 0.0f;
1527
+ float sumf_m = 0.0f;
1528
+
1529
+ const float d = bq4_K->d;
1530
+ const float dmin = bq4_K->dmin;
1531
+
1532
+ const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
1533
+
1534
+ for (int i = 0; i < QR4_K; ++i) {
1535
+ const int isc = bq8_offset + i;
1536
+
1537
+ uint8_t sc, m;
1538
+ get_scale_min_k4(isc, bq4_K->scales, sc, m);
1539
+
1540
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1541
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1542
+ const float d8i = bq8i->d;
1543
+
1544
+ const int vi = (v >> (4*i)) & 0x0F0F0F0F;
1545
+
1546
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1547
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
1548
+ }
1549
+
1550
+ return d*sumf_d - dmin*sumf_m;
1551
+ #else
1552
+ return 0.0f; // only to satisfy the compiler
1553
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1554
+ }
1555
+
1556
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
1557
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1558
+
1559
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1560
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
1561
+
1562
+ const int bq8_offset = QR5_K * (iqs / QI8_1);
1563
+
1564
+ float sumf_d = 0.0f;
1565
+ float sumf_m = 0.0f;
1566
+
1567
+ const float d = bq5_K->d;
1568
+ const float dmin = bq5_K->dmin;
1569
+
1570
+ const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
1571
+
1572
+ const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
1573
+
1574
+ for (int i = 0; i < QR5_K; ++i) {
1575
+ const int isc = bq8_offset + i;
1576
+
1577
+ uint8_t sc, m;
1578
+ get_scale_min_k4(isc, bq5_K->scales, sc, m);
1579
+
1580
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1581
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1582
+ const float d8i = bq8i->d;
1583
+
1584
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1585
+
1586
+ const int vih = ((vh >> i) << 4) & 0x10101010;
1587
+
1588
+ const int vi = vil | vih;
1589
+
1590
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1591
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
1592
+ }
1593
+
1594
+ return d*sumf_d - dmin*sumf_m;
1595
+ #else
1596
+ return 0.0f; // only to satisfy the compiler
1597
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1598
+ }
1599
+
1600
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
1601
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1602
+
1603
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1604
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
1605
+
1606
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
1607
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
1608
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
1609
+
1610
+ float sumf = 0.0f;
1611
+
1612
+ const float d = bq6_K->d;
1613
+
1614
+ int vl;
1615
+ memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
1616
+
1617
+ int vh;
1618
+ memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
1619
+
1620
+ for (int i = 0; i < QR6_K; ++i) {
1621
+ const int sc = bq6_K->scales[scale_offset + 4*i];
1622
+
1623
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
1624
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
1625
+ const float d8i = bq8i->d;
1626
+
1627
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1628
+
1629
+ const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
1630
+
1631
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1632
+
1633
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1634
+ }
1635
+
1636
+ return d*sumf;
1637
+ #else
1638
+ return 0.0f; // only to satisfy the compiler
1639
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1366
1640
  }
1367
1641
 
1368
1642
  template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
1369
- static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
1643
+ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1370
1644
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
1371
1645
 
1372
1646
  if (row >= nrows) {
@@ -1385,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
1385
1659
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1386
1660
  const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
1387
1661
 
1388
- const int iby = i + threadIdx.x / qi; // y block index
1662
+ const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
1389
1663
 
1390
1664
  const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
1391
1665
 
@@ -1404,7 +1678,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
1404
1678
  }
1405
1679
 
1406
1680
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1407
- static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
1681
+ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
1408
1682
  // qk = quantized weights per x block
1409
1683
  // qr = number of quantized weights per data value in x block
1410
1684
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1745,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
1471
1745
  }
1472
1746
  }
1473
1747
 
1474
- static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1748
+ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1475
1749
  const half * x = (const half *) vx;
1476
1750
 
1477
1751
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1792,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
1518
1792
  }
1519
1793
 
1520
1794
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1521
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1795
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1522
1796
  const int row_stride_x, const int channel_stride_x) {
1523
1797
 
1524
1798
  const half * x = (const half *) vx;
@@ -1623,6 +1897,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
1623
1897
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
1624
1898
  }
1625
1899
 
1900
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
1901
+ const int col = blockDim.x*blockIdx.x + threadIdx.x;
1902
+ const int half_n_dims = ncols/4;
1903
+
1904
+ if (col >= half_n_dims) {
1905
+ return;
1906
+ }
1907
+
1908
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
1909
+ const int i = row*ncols + col;
1910
+
1911
+ const float col_theta_scale = powf(theta_scale, col);
1912
+
1913
+ const float theta = p*col_theta_scale;
1914
+ const float sin_theta = sinf(theta);
1915
+ const float cos_theta = cosf(theta);
1916
+
1917
+ const float x0 = x[i + 0];
1918
+ const float x1 = x[i + half_n_dims];
1919
+
1920
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
1921
+ dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
1922
+
1923
+ const float block_theta = block_p*col_theta_scale;
1924
+ const float sin_block_theta = sinf(block_theta);
1925
+ const float cos_block_theta = cosf(block_theta);
1926
+
1927
+ const float x2 = x[i + half_n_dims * 2];
1928
+ const float x3 = x[i + half_n_dims * 3];
1929
+
1930
+ dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
1931
+ dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
1932
+ }
1933
+
1626
1934
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
1627
1935
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
1628
1936
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1688,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
1688
1996
  dst[i] = scale * x[i];
1689
1997
  }
1690
1998
 
1691
- static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
1692
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
1693
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
1999
+ static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
2000
+ const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
2001
+ add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
1694
2002
  }
1695
2003
 
1696
2004
  static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1703,20 +2011,31 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
1703
2011
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
1704
2012
  }
1705
2013
 
2014
+ static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
2015
+ const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
2016
+ gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
2017
+ }
2018
+
1706
2019
  static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
1707
2020
  const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
1708
2021
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
1709
2022
  }
1710
2023
 
2024
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2025
+ GGML_ASSERT(ncols % WARP_SIZE == 0);
2026
+ const dim3 block_dims(WARP_SIZE, 1, 1);
2027
+ norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
2028
+ }
2029
+
1711
2030
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1712
2031
  GGML_ASSERT(ncols % WARP_SIZE == 0);
1713
2032
  const dim3 block_dims(WARP_SIZE, 1, 1);
1714
2033
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
1715
2034
  }
1716
2035
 
1717
- static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
2036
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
1718
2037
  const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
1719
- quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
2038
+ quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
1720
2039
  }
1721
2040
 
1722
2041
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -1873,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
1873
2192
  }
1874
2193
 
1875
2194
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1876
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2195
+ GGML_ASSERT(ncols % QK4_0 == 0);
1877
2196
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1878
2197
  const dim3 block_nums(1, block_num_y, 1);
1879
2198
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1882,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
1882
2201
  }
1883
2202
 
1884
2203
  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1885
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2204
+ GGML_ASSERT(ncols % QK4_1 == 0);
1886
2205
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1887
2206
  const dim3 block_nums(1, block_num_y, 1);
1888
2207
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1891,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
1891
2210
  }
1892
2211
 
1893
2212
  static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1894
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2213
+ GGML_ASSERT(ncols % QK5_0 == 0);
1895
2214
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1896
2215
  const dim3 block_nums(1, block_num_y, 1);
1897
2216
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1900,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
1900
2219
  }
1901
2220
 
1902
2221
  static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1903
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2222
+ GGML_ASSERT(ncols % QK5_1 == 0);
1904
2223
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1905
2224
  const dim3 block_nums(1, block_num_y, 1);
1906
2225
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1909,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
1909
2228
  }
1910
2229
 
1911
2230
  static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1912
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2231
+ GGML_ASSERT(ncols % QK8_0 == 0);
1913
2232
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1914
2233
  const dim3 block_nums(1, block_num_y, 1);
1915
2234
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1917,6 +2236,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
1917
2236
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1918
2237
  }
1919
2238
 
2239
+ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2240
+ GGML_ASSERT(ncols % QK_K == 0);
2241
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2242
+ const dim3 block_nums(1, block_num_y, 1);
2243
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2244
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
2245
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2246
+ }
2247
+
2248
+ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2249
+ GGML_ASSERT(ncols % QK_K == 0);
2250
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2251
+ const dim3 block_nums(1, block_num_y, 1);
2252
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2253
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
2254
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2255
+ }
2256
+
2257
+ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2258
+ GGML_ASSERT(ncols % QK_K == 0);
2259
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2260
+ const dim3 block_nums(1, block_num_y, 1);
2261
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2262
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
2263
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2264
+ }
2265
+
2266
+ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2267
+ GGML_ASSERT(ncols % QK_K == 0);
2268
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2269
+ const dim3 block_nums(1, block_num_y, 1);
2270
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2271
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
2272
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2273
+ }
2274
+
2275
+ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2276
+ GGML_ASSERT(ncols % QK_K == 0);
2277
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2278
+ const dim3 block_nums(1, block_num_y, 1);
2279
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2280
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
2281
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2282
+ }
2283
+
1920
2284
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1921
2285
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1922
2286
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2009,6 +2373,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
2009
2373
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
2010
2374
  }
2011
2375
 
2376
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
2377
+ GGML_ASSERT(nrows % 4 == 0);
2378
+ const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
2379
+ const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
2380
+ const dim3 block_nums(num_blocks_x, nrows, 1);
2381
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
2382
+ }
2383
+
2012
2384
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
2013
2385
  const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
2014
2386
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2051,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
2051
2423
  scoped_spin_lock lock(g_cuda_pool_lock);
2052
2424
  int id;
2053
2425
  CUDA_CHECK(cudaGetDevice(&id));
2054
-
2426
+ #ifdef DEBUG_CUDA_MALLOC
2427
+ int nnz = 0;
2428
+ size_t max_size = 0, tot_size = 0;
2429
+ #endif
2430
+ size_t best_diff = 1ull << 36;
2431
+ int ibest = -1;
2055
2432
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
2056
2433
  cuda_buffer& b = g_cuda_buffer_pool[id][i];
2057
- if (b.size >= size && b.ptr != nullptr) {
2058
- void * ptr = b.ptr;
2059
- *actual_size = b.size;
2060
- b.ptr = nullptr;
2061
- b.size = 0;
2062
- return ptr;
2434
+ if (b.ptr != nullptr) {
2435
+ #ifdef DEBUG_CUDA_MALLOC
2436
+ ++nnz;
2437
+ tot_size += b.size;
2438
+ if (b.size > max_size) max_size = b.size;
2439
+ #endif
2440
+ if (b.size >= size) {
2441
+ size_t diff = b.size - size;
2442
+ if (diff < best_diff) {
2443
+ best_diff = diff;
2444
+ ibest = i;
2445
+ if (!best_diff) {
2446
+ void * ptr = b.ptr;
2447
+ *actual_size = b.size;
2448
+ b.ptr = nullptr;
2449
+ b.size = 0;
2450
+ return ptr;
2451
+ }
2452
+ }
2453
+ }
2063
2454
  }
2064
2455
  }
2456
+ if (ibest >= 0) {
2457
+ cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
2458
+ void * ptr = b.ptr;
2459
+ *actual_size = b.size;
2460
+ b.ptr = nullptr;
2461
+ b.size = 0;
2462
+ return ptr;
2463
+ }
2464
+ #ifdef DEBUG_CUDA_MALLOC
2465
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
2466
+ (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
2467
+ #endif
2065
2468
  void * ptr;
2066
- CUDA_CHECK(cudaMalloc((void **) &ptr, size));
2067
- *actual_size = size;
2469
+ size_t look_ahead_size = (size_t) (1.05 * size);
2470
+ look_ahead_size = 256 * ((look_ahead_size + 255)/256);
2471
+ CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
2472
+ *actual_size = look_ahead_size;
2068
2473
  return ptr;
2069
2474
  }
2070
2475
 
@@ -2140,6 +2545,9 @@ void ggml_init_cublas() {
2140
2545
  }
2141
2546
 
2142
2547
  void ggml_cuda_set_tensor_split(const float * tensor_split) {
2548
+ if (tensor_split == nullptr) {
2549
+ return;
2550
+ }
2143
2551
  bool all_zero = true;
2144
2552
  for (int i = 0; i < g_device_count; ++i) {
2145
2553
  if (tensor_split[i] != 0.0f) {
@@ -2236,16 +2644,19 @@ inline void ggml_cuda_op_add(
2236
2644
 
2237
2645
  GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
2238
2646
  GGML_ASSERT(src1_ddf_i != nullptr);
2239
- GGML_ASSERT(dst_ddf_i != nullptr);
2647
+ GGML_ASSERT(dst_ddf_i != nullptr);
2240
2648
 
2241
- const int64_t ne0 = src0->ne[0];
2649
+ const int64_t ne00 = src0->ne[0];
2242
2650
  const int64_t i01_diff = i01_high - i01_low;
2243
2651
 
2652
+ const int64_t ne10 = src1->ne[0];
2653
+ const int64_t ne11 = src1->ne[1];
2654
+
2244
2655
  // compute
2245
2656
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
2246
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
2657
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
2247
2658
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
2248
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
2659
+ add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
2249
2660
  } else {
2250
2661
  GGML_ASSERT(false);
2251
2662
  }
@@ -2264,27 +2675,41 @@ inline void ggml_cuda_op_mul(
2264
2675
 
2265
2676
  GGML_ASSERT(src0_ddf_i != nullptr);
2266
2677
  GGML_ASSERT(src1_ddf_i != nullptr);
2267
- GGML_ASSERT(dst_ddf_i != nullptr);
2678
+ GGML_ASSERT(dst_ddf_i != nullptr);
2268
2679
 
2269
2680
  const int64_t ne00 = src0->ne[0];
2681
+ const int64_t i01_diff = i01_high - i01_low;
2270
2682
 
2271
2683
  const int64_t ne10 = src1->ne[0];
2272
2684
  const int64_t ne11 = src1->ne[1];
2273
2685
 
2274
- for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
2275
- const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
2686
+ mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
2276
2687
 
2277
- float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
2278
- float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
2279
- float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
2688
+ (void) dst;
2689
+ (void) src0_ddq_i;
2690
+ (void) i02;
2691
+ }
2280
2692
 
2281
- // compute
2282
- mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
2283
- }
2693
+ inline void ggml_cuda_op_gelu(
2694
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2695
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2696
+ cudaStream_t & cudaStream_main){
2697
+
2698
+ GGML_ASSERT(src0_ddf_i != nullptr);
2699
+ GGML_ASSERT(dst_ddf_i != nullptr);
2700
+
2701
+ const int64_t ne00 = src0->ne[0];
2702
+ const int64_t i01_diff = i01_high - i01_low;
2703
+
2704
+ // compute
2705
+ gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
2284
2706
 
2707
+ (void) src1;
2285
2708
  (void) dst;
2286
2709
  (void) src0_ddq_i;
2710
+ (void) src1_ddf_i;
2287
2711
  (void) i02;
2712
+ (void) i1;
2288
2713
  }
2289
2714
 
2290
2715
  inline void ggml_cuda_op_silu(
@@ -2309,6 +2734,28 @@ inline void ggml_cuda_op_silu(
2309
2734
  (void) i1;
2310
2735
  }
2311
2736
 
2737
+ inline void ggml_cuda_op_norm(
2738
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2739
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2740
+ cudaStream_t & cudaStream_main){
2741
+
2742
+ GGML_ASSERT(src0_ddf_i != nullptr);
2743
+ GGML_ASSERT(dst_ddf_i != nullptr);
2744
+
2745
+ const int64_t ne00 = src0->ne[0];
2746
+ const int64_t i01_diff = i01_high - i01_low;
2747
+
2748
+ // compute
2749
+ norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2750
+
2751
+ (void) src1;
2752
+ (void) dst;
2753
+ (void) src0_ddq_i;
2754
+ (void) src1_ddf_i;
2755
+ (void) i02;
2756
+ (void) i1;
2757
+ }
2758
+
2312
2759
  inline void ggml_cuda_op_rms_norm(
2313
2760
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2314
2761
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2349,22 +2796,30 @@ inline void ggml_cuda_op_mul_mat_vec(
2349
2796
  int id;
2350
2797
  CUDA_CHECK(cudaGetDevice(&id));
2351
2798
 
2352
- const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
2799
+ bool mul_mat_vec_q_implemented =
2800
+ src0->type == GGML_TYPE_Q4_0 ||
2353
2801
  src0->type == GGML_TYPE_Q4_1 ||
2354
2802
  src0->type == GGML_TYPE_Q5_0 ||
2355
2803
  src0->type == GGML_TYPE_Q5_1 ||
2356
2804
  src0->type == GGML_TYPE_Q8_0;
2357
-
2358
- // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
2359
- // However, they have bad performance with Pascal cards.
2360
- // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
2361
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
2805
+ #if QK_K == 256
2806
+ mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
2807
+ src0->type == GGML_TYPE_Q2_K ||
2808
+ src0->type == GGML_TYPE_Q3_K ||
2809
+ src0->type == GGML_TYPE_Q4_K ||
2810
+ src0->type == GGML_TYPE_Q5_K ||
2811
+ src0->type == GGML_TYPE_Q6_K;
2812
+ #endif // QK_K == 256
2813
+
2814
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
2362
2815
  #endif
2363
2816
 
2364
2817
  if (use_mul_mat_vec_q) {
2818
+ int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
2819
+ padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
2365
2820
  size_t as;
2366
- void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
2367
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
2821
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
2822
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
2368
2823
 
2369
2824
  switch (src0->type) {
2370
2825
  case GGML_TYPE_Q4_0:
@@ -2382,6 +2837,21 @@ inline void ggml_cuda_op_mul_mat_vec(
2382
2837
  case GGML_TYPE_Q8_0:
2383
2838
  mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2384
2839
  break;
2840
+ case GGML_TYPE_Q2_K:
2841
+ mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2842
+ break;
2843
+ case GGML_TYPE_Q3_K:
2844
+ mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2845
+ break;
2846
+ case GGML_TYPE_Q4_K:
2847
+ mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2848
+ break;
2849
+ case GGML_TYPE_Q5_K:
2850
+ mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2851
+ break;
2852
+ case GGML_TYPE_Q6_K:
2853
+ mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2854
+ break;
2385
2855
  default:
2386
2856
  GGML_ASSERT(false);
2387
2857
  break;
@@ -2516,13 +2986,26 @@ inline void ggml_cuda_op_rope(
2516
2986
  const int n_past = ((int32_t *) src1->data)[0];
2517
2987
  const int n_dims = ((int32_t *) src1->data)[1];
2518
2988
  const int mode = ((int32_t *) src1->data)[2];
2519
- GGML_ASSERT(mode == 0);
2989
+ const int n_ctx = ((int32_t *) src1->data)[3];
2990
+
2991
+ // RoPE alteration for extended context
2992
+ float freq_base, freq_scale;
2993
+ memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
2994
+ memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
2995
+
2996
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
2997
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
2520
2998
 
2521
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
2522
- const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
2999
+ bool is_glm = mode & 4;
2523
3000
 
2524
3001
  // compute
2525
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
3002
+ if (is_glm) {
3003
+ const float id_p = min(p, n_ctx - 2.f);
3004
+ const float block_p = max(p - (n_ctx - 2.f), 0.f);
3005
+ rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
3006
+ } else {
3007
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
3008
+ }
2526
3009
 
2527
3010
  (void) dst;
2528
3011
  (void) src0_ddq_i;
@@ -2925,11 +3408,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2925
3408
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
2926
3409
  }
2927
3410
 
3411
+ void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3412
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3413
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
3414
+ }
3415
+
2928
3416
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2929
3417
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2930
3418
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
2931
3419
  }
2932
3420
 
3421
+ void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3422
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3423
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
3424
+ }
3425
+
2933
3426
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2934
3427
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2935
3428
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3085,6 +3578,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
3085
3578
  (void) dst;
3086
3579
  }
3087
3580
 
3581
+ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3582
+ ggml_cuda_cpy(src0, dst, nullptr);
3583
+ (void) src1;
3584
+ }
3585
+
3088
3586
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3089
3587
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3090
3588
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3108,7 +3606,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens

  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  int nrows = ggml_nrows(tensor);
+
+ const int64_t ne0 = tensor->ne[0];
+
  const size_t nb1 = tensor->nb[1];
+
  ggml_backend backend = tensor->backend;
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));
@@ -3137,13 +3639,26 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  int64_t nrows_split = row_high - row_low;

  const size_t offset_split = row_low*nb1;
- const size_t size = ggml_nbytes_split(tensor, nrows_split);
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+ }

- void * buf;
+ char * buf;
  CUDA_CHECK(cudaMalloc(&buf, size));
- void * buf_host = (char*)data + offset_split;
+ char * buf_host = (char*)data + offset_split;

- cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+ // set padding to 0 to avoid possible NaN values
+ if (size > original_size) {
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+ }
+
+
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));

  extra->data_device[id] = buf;

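Note: the transform path now rounds the last row of each device split up to a multiple of MATRIX_ROW_PADDING elements and zero-fills the tail, so quantized kernels can read whole blocks without running past the end of the buffer. A small sketch of the rounding rule follows; the helper name and the 256-element default are illustrative and simply mirror the comment in the diff.

    // Sketch only: bytes needed for one row after padding its length up to a multiple of row_padding.
    #include <cstddef>
    #include <cstdint>

    static size_t padded_row_bytes(int64_t ne0, size_t type_size, size_t block_size,
                                   int64_t row_padding = 256) {
        int64_t padded_ne0 = ne0;
        if (ne0 % row_padding != 0) {
            padded_ne0 += row_padding - ne0 % row_padding;  // round the row length up
        }
        return (size_t) padded_ne0 * type_size / block_size; // bytes per (padded) row
    }
    // e.g. an f32 row with ne0 = 1000 (type size 4, block size 1) rounds up to 1024 elements,
    // 4096 bytes instead of 4000; the extra bytes are memset to 0 on the device.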
@@ -3177,43 +3692,60 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
  delete extra;
  }

+ static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+ static size_t g_temp_tensor_extra_index = 0;
+
+ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (g_temp_tensor_extras == nullptr) {
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = g_temp_tensor_extra_index;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+ }
+
  void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
  if (scratch && g_scratch_size == 0) {
  return;
  }

  // recursively assign CUDA buffers until a compute tensor is found
- if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
- const ggml_op src0_op = tensor->src0->op;
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
- ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+ const ggml_op src0_op = tensor->src[0]->op;
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
  }
  }
- if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
  }

  tensor->backend = GGML_BACKEND_GPU;
- struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
+ struct ggml_tensor_extra_gpu * extra;

- const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW ||
  force_inplace;
  const size_t size = ggml_nbytes(tensor);

  CUDA_CHECK(cudaSetDevice(g_main_device));
- if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
+ memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src0_ddc + offset;
  } else if (tensor->op == GGML_OP_CPY) {
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
  void * src1_ddv = src1_extra->data_device[g_main_device];
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src1_ddv;
  } else if (scratch) {
  GGML_ASSERT(size <= g_scratch_size);
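Note: ggml_cuda_alloc_temp_tensor_extra replaces the per-tensor `new ggml_tensor_extra_gpu` for views, copies and scratch tensors with a fixed pool of GGML_MAX_NODES slots handed out round-robin, so these short-lived extras are never individually heap-allocated. A generic sketch of the same pattern, purely illustrative and not part of ggml:

    // Sketch only: fixed round-robin pool; entries are valid until the index wraps,
    // so callers must not hold one across a full cycle of N allocations.
    #include <array>
    #include <cstring>

    template <typename T, size_t N>
    struct ring_pool {
        std::array<T, N> slots{};
        size_t next = 0;

        T * alloc() {                       // T must be trivially copyable (memset below)
            T * item = &slots[next];
            next = (next + 1) % N;
            std::memset(item, 0, sizeof(T));
            return item;
        }
    };

The implicit contract in the diff is the same: a pooled extra is only used while its node is being set up, and after GGML_MAX_NODES further allocations the slot is recycled.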
@@ -3226,6 +3758,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
  g_scratch_buffer = data;
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = data + g_scratch_offset;

  g_scratch_offset += size;
@@ -3235,6 +3768,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  void * data;
  CUDA_CHECK(cudaMalloc(&data, size));
  CUDA_CHECK(cudaMemset(data, 0, size));
+ extra = new ggml_tensor_extra_gpu;
+ memset(extra, 0, sizeof(*extra));
  extra->data_device[g_main_device] = data;
  }

@@ -3283,10 +3818,16 @@ void ggml_cuda_free_scratch() {
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
- || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
- || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

  switch (tensor->op) {
+ case GGML_OP_DUP:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
  case GGML_OP_ADD:
  if (!any_on_device) {
  return false;
@@ -3299,12 +3840,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_mul;
  break;
+ case GGML_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_gelu;
+ break;
  case GGML_OP_SILU:
  if (!any_on_device) {
  return false;
  }
  func = ggml_cuda_silu;
  break;
+ case GGML_OP_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_norm;
+ break;
  case GGML_OP_RMS_NORM:
  if (!any_on_device) {
  return false;
@@ -3312,7 +3865,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  func = ggml_cuda_rms_norm;
  break;
  case GGML_OP_MUL_MAT:
- if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
  return false;
  }
  func = ggml_cuda_mul_mat;
@@ -3329,6 +3882,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_cpy;
  break;
+ case GGML_OP_CONT:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE:
@@ -3366,6 +3925,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return true;
  }
- func(tensor->src0, tensor->src1, tensor);
+ func(tensor->src[0], tensor->src[1], tensor);
  return true;
  }
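Note: ggml_cuda_compute_forward returns false when an op has no CUDA path or none of its operands live on the GPU, which lets the caller fall back to the CPU implementation. A hypothetical call site is sketched below; the surrounding loop and the CPU fallback name are assumptions for illustration, not code from this diff, and the ggml types are assumed to come from the ggml headers.

    // Sketch only: how a per-node compute loop might use the dispatcher.
    extern bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
    static void compute_node_on_cpu(struct ggml_compute_params * params, struct ggml_tensor * node); // placeholder

    static void compute_node(struct ggml_compute_params * params, struct ggml_tensor * node) {
        if (ggml_cuda_compute_forward(params, node)) {
            return;                          // handled on the GPU (INIT/FINALIZE phases return true as no-ops)
        }
        compute_node_on_cpu(params, node);   // placeholder CPU fallback
    }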