llama_cpp 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,8 @@
13
13
  #include "ggml-cuda.h"
14
14
  #include "ggml.h"
15
15
 
16
+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
17
+
16
18
  #if defined(_MSC_VER)
17
19
  #pragma warning(disable: 4244 4267) // possible loss of data
18
20
  #endif
@@ -59,8 +61,8 @@ typedef float2 dfloat2;
59
61
  #endif //GGML_CUDA_DMMV_F16
60
62
 
61
63
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
62
- typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
63
- typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
64
+ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
65
+ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
64
66
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
65
67
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
66
68
  typedef void (*ggml_cuda_op_t)(
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(
74
76
 
75
77
  #define QK4_0 32
76
78
  #define QR4_0 2
77
- #define QI4_0 4
79
+ #define QI4_0 (QK4_0 / (4 * QR4_0))
78
80
  typedef struct {
79
81
  half d; // delta
80
82
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
83
85
 
84
86
  #define QK4_1 32
85
87
  #define QR4_1 2
86
- #define QI4_1 4
88
+ #define QI4_1 (QK4_1 / (4 * QR4_1))
87
89
  typedef struct {
88
90
  half d; // delta
89
91
  half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
93
95
 
94
96
  #define QK5_0 32
95
97
  #define QR5_0 2
96
- #define QI5_0 4
98
+ #define QI5_0 (QK5_0 / (4 * QR5_0))
97
99
  typedef struct {
98
100
  half d; // delta
99
101
  uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
103
105
 
104
106
  #define QK5_1 32
105
107
  #define QR5_1 2
106
- #define QI5_1 4
108
+ #define QI5_1 (QK5_1 / (4 * QR5_1))
107
109
  typedef struct {
108
110
  half d; // delta
109
111
  half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
114
116
 
115
117
  #define QK8_0 32
116
118
  #define QR8_0 1
117
- #define QI8_0 8
119
+ #define QI8_0 (QK8_0 / (4 * QR8_0))
118
120
  typedef struct {
119
121
  half d; // delta
120
122
  int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
123
125
 
124
126
  #define QK8_1 32
125
127
  #define QR8_1 1
126
- #define QI8_1 8
128
+ #define QI8_1 (QK8_1 / (4 * QR8_1))
127
129
  typedef struct {
128
130
  half d; // delta
129
131
  half s; // unquantized sum
@@ -131,7 +133,7 @@ typedef struct {
131
133
  } block_q8_1;
132
134
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
133
135
 
134
- typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
136
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
135
137
 
136
138
  //================================= k-quants
137
139
 
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, co
143
145
  #define K_SCALE_SIZE 12
144
146
  #endif
145
147
 
148
+ #define QR2_K 4
149
+ #define QI2_K (QK_K / (4*QR2_K))
146
150
  typedef struct {
147
151
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
148
152
  uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
151
155
  } block_q2_K;
152
156
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
153
157
 
158
+ #define QR3_K 4
159
+ #define QI3_K (QK_K / (4*QR3_K))
154
160
  typedef struct {
155
161
  uint8_t hmask[QK_K/8]; // quants - high bit
156
162
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
163
169
  } block_q3_K;
164
170
  //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
165
171
 
172
+ #define QR4_K 2
173
+ #define QI4_K (QK_K / (4*QR4_K))
166
174
  #ifdef GGML_QKK_64
167
175
  typedef struct {
168
176
  half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
180
188
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
181
189
  #endif
182
190
 
191
+ #define QR5_K 2
192
+ #define QI5_K (QK_K / (4*QR5_K))
183
193
  #ifdef GGML_QKK_64
184
194
  typedef struct {
185
195
  half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
199
209
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
200
210
  #endif
201
211
 
212
+ #define QR6_K 2
213
+ #define QI6_K (QK_K / (4*QR6_K))
202
214
  typedef struct {
203
215
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
204
216
  uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -208,9 +220,11 @@ typedef struct {
208
220
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
209
221
 
210
222
  #define WARP_SIZE 32
223
+ #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
211
224
 
212
225
  #define CUDA_ADD_BLOCK_SIZE 256
213
226
  #define CUDA_MUL_BLOCK_SIZE 256
227
+ #define CUDA_GELU_BLOCK_SIZE 256
214
228
  #define CUDA_SILU_BLOCK_SIZE 256
215
229
  #define CUDA_CPY_BLOCK_SIZE 32
216
230
  #define CUDA_SCALE_BLOCK_SIZE 256
@@ -238,13 +252,13 @@ struct ggml_tensor_extra_gpu {
238
252
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
239
253
  };
240
254
 
241
- static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
255
+ static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
242
256
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
243
257
 
244
- if (i >= k) {
258
+ if (i >= kx) {
245
259
  return;
246
260
  }
247
- dst[i] = x[i] + y[i];
261
+ dst[i] = x[i] + y[i%ky];
248
262
  }
249
263
 
250
264
  static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -265,6 +279,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
265
279
  dst[i] = x[i] * y[i%ky];
266
280
  }
267
281
 
282
+ static __global__ void gelu_f32(const float * x, float * dst, const int k) {
283
+ const float GELU_COEF_A = 0.044715f;
284
+ const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
285
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
286
+
287
+ if (i >= k) {
288
+ return;
289
+ }
290
+
291
+ float xi = x[i];
292
+ dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
293
+ }
294
+
268
295
  static __global__ void silu_f32(const float * x, float * dst, const int k) {
269
296
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
270
297
 
@@ -274,16 +301,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
274
301
  dst[i] = x[i] / (1.0f + expf(-x[i]));
275
302
  }
276
303
 
304
+ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
305
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
306
+ const int tid = threadIdx.x;
307
+
308
+ const float eps = 1e-5f;
309
+
310
+ float mean = 0.0f;
311
+ float var = 0.0f;
312
+
313
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
314
+ const float xi = x[row*ncols + col];
315
+ mean += xi;
316
+ var += xi * xi;
317
+ }
318
+
319
+ // sum up partial sums
320
+ #pragma unroll
321
+ for (int mask = 16; mask > 0; mask >>= 1) {
322
+ mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
323
+ var += __shfl_xor_sync(0xffffffff, var, mask, 32);
324
+ }
325
+
326
+ mean /= ncols;
327
+ var = var / ncols - mean * mean;
328
+ const float inv_var = rsqrtf(var + eps);
329
+
330
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
331
+ dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
332
+ }
333
+ }
334
+
277
335
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
278
336
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
279
337
  const int tid = threadIdx.x;
280
338
 
281
- const float eps = 1e-6;
339
+ const float eps = 1e-6f;
282
340
 
283
341
  float tmp = 0.0f; // partial sum for thread in warp
284
342
 
285
- for (int i = 0; i < ncols; i += WARP_SIZE) {
286
- const int col = i + tid;
343
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
287
344
  const float xi = x[row*ncols + col];
288
345
  tmp += xi * xi;
289
346
  }
@@ -295,10 +352,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
295
352
  }
296
353
 
297
354
  const float mean = tmp / ncols;
298
- const float scale = 1.0f / sqrtf(mean + eps);
355
+ const float scale = rsqrtf(mean + eps);
299
356
 
300
- for (int i = 0; i < ncols; i += WARP_SIZE) {
301
- const int col = i + tid;
357
+ for (int col = tid; col < ncols; col += WARP_SIZE) {
302
358
  dst[row*ncols + col] = scale * x[row*ncols + col];
303
359
  }
304
360
  }
@@ -407,7 +463,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
407
463
 
408
464
  //================================== k-quants
409
465
 
410
- static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
466
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
411
467
 
412
468
  const int i = blockIdx.x;
413
469
  const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +496,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
440
496
 
441
497
  }
442
498
 
443
- static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
499
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
444
500
 
445
501
  const int i = blockIdx.x;
446
502
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +560,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
504
560
  }
505
561
  #endif
506
562
 
507
- static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
563
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
508
564
  const block_q4_K * x = (const block_q4_K *) vx;
509
565
 
510
566
  const int i = blockIdx.x;
@@ -544,7 +600,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
544
600
  #endif
545
601
  }
546
602
 
547
- static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
603
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
548
604
  const block_q5_K * x = (const block_q5_K *) vx;
549
605
 
550
606
  const int i = blockIdx.x;
@@ -590,7 +646,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
590
646
  #endif
591
647
  }
592
648
 
593
- static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
649
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
594
650
  const block_q6_K * x = (const block_q6_K *) vx;
595
651
 
596
652
  const int i = blockIdx.x;
@@ -634,7 +690,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
634
690
  #endif
635
691
  }
636
692
 
637
- static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
693
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
638
694
 
639
695
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
640
696
 
@@ -742,7 +798,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
742
798
  }
743
799
  }
744
800
 
745
- static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
801
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
746
802
 
747
803
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
748
804
  if (row > nrows) return;
@@ -846,7 +902,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
846
902
  }
847
903
  }
848
904
 
849
- static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
905
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
850
906
 
851
907
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
852
908
  if (row > nrows) return;
@@ -949,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
949
1005
  }
950
1006
  }
951
1007
 
952
- static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
1008
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
953
1009
 
954
1010
  const int row = blockIdx.x;
955
1011
  const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
1053
1109
  }
1054
1110
  }
1055
1111
 
1056
- static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
1112
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1057
1113
 
1058
1114
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
1059
1115
 
@@ -1171,7 +1227,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1171
1227
  v.y = x[ib + iqs + 1];
1172
1228
  }
1173
1229
 
1174
- static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
1230
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1175
1231
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
1176
1232
 
1177
1233
  if (i >= k) {
@@ -1180,10 +1236,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
1180
1236
 
1181
1237
  block_q8_1 * y = (block_q8_1 *) vy;
1182
1238
 
1183
- const int ib = i / QK8_0; // block index
1184
- const int iqs = i % QK8_0; // quant index
1239
+ const int ib = i / QK8_1; // block index
1240
+ const int iqs = i % QK8_1; // quant index
1185
1241
 
1186
- const float xi = x[i];
1242
+ const float xi = i < ndata ? x[i] : 0.0f;
1187
1243
  float amax = fabsf(xi);
1188
1244
  float sum = xi;
1189
1245
 
@@ -1207,7 +1263,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
1207
1263
  }
1208
1264
 
1209
1265
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1210
- static __global__ void dequantize_block(const void * vx, float * y, const int k) {
1266
+ static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
1211
1267
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
1212
1268
 
1213
1269
  if (i >= k) {
@@ -1227,8 +1283,9 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
1227
1283
  y[iybs + iqs + y_offset] = v.y;
1228
1284
  }
1229
1285
 
1230
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1231
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1286
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1287
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1288
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1232
1289
  const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1233
1290
 
1234
1291
  int vi;
@@ -1249,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
1249
1306
  return sumi*d;
1250
1307
  #else
1251
1308
  return 0.0f; // only to satisfy the compiler
1252
- #endif // __CUDA_ARCH__ >= 600
1309
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1253
1310
  }
1254
1311
 
1255
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1256
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1312
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1313
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1314
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1257
1315
  const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1258
1316
 
1259
1317
  const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1274,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
1274
1332
  return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1275
1333
  #else
1276
1334
  return 0.0f; // only to satisfy the compiler
1277
- #endif // __CUDA_ARCH__ >= 600
1335
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1278
1336
  }
1279
1337
 
1280
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1281
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1338
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1339
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1340
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1282
1341
  const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1283
1342
 
1284
1343
  int qs;
@@ -1309,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
1309
1368
  return sumi*d;
1310
1369
  #else
1311
1370
  return 0.0f; // only to satisfy the compiler
1312
- #endif // __CUDA_ARCH__ >= 600
1371
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1313
1372
  }
1314
1373
 
1315
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1316
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1374
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1375
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1376
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1317
1377
  const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1318
1378
 
1319
1379
  const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1343,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
1343
1403
  return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1344
1404
  #else
1345
1405
  return 0.0f; // only to satisfy the compiler
1346
- #endif // __CUDA_ARCH__ >= 600
1406
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1347
1407
  }
1348
1408
 
1349
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1350
- #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1409
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1410
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1411
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1351
1412
  const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1352
1413
 
1353
1414
  int vi;
@@ -1362,11 +1423,224 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
1362
1423
  return sumi*d;
1363
1424
  #else
1364
1425
  return 0.0f; // only to satisfy the compiler
1365
- #endif // __CUDA_ARCH__ >= 600
1426
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1427
+ }
1428
+
1429
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
1430
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1431
+
1432
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1433
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
1434
+
1435
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
1436
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1437
+
1438
+ float sumf_d = 0.0f;
1439
+ float sumf_m = 0.0f;
1440
+
1441
+ const float d = bq2_K->d;
1442
+ const float dmin = bq2_K->dmin;
1443
+
1444
+ const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
1445
+
1446
+ for (int i = 0; i < QR2_K; ++i) {
1447
+ const int sc = bq2_K->scales[scale_offset + 2*i];
1448
+
1449
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1450
+ const float d8i = bq8i->d;
1451
+
1452
+ const int vi = (v >> (2*i)) & 0x03030303;
1453
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1454
+
1455
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
1456
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
1457
+ }
1458
+
1459
+ return d*sumf_d - dmin*sumf_m;
1460
+ #else
1461
+ return 0.0f; // only to satisfy the compiler
1462
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1463
+ }
1464
+
1465
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
1466
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1467
+
1468
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1469
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
1470
+
1471
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
1472
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1473
+
1474
+ float sumf = 0.0f;
1475
+
1476
+ const float d = bq3_K->d;
1477
+
1478
+ int vl;
1479
+ memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
1480
+
1481
+ int vh;
1482
+ memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
1483
+ vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
1484
+ vh >>= bq8_offset;
1485
+
1486
+ for (int i = 0; i < QR3_K; ++i) {
1487
+ const int isc = scale_offset + 2*i;
1488
+
1489
+ const int isc_low = isc % (QK_K/32);
1490
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
1491
+ const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
1492
+
1493
+ const int isc_high = isc % (QK_K/64);
1494
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
1495
+ const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1496
+
1497
+ const int sc = (sc_low | sc_high) - 32;
1498
+
1499
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1500
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1501
+ const float d8i = bq8i->d;
1502
+
1503
+ const int vil = (vl >> (2*i)) & 0x03030303;
1504
+
1505
+ const int vih = ((vh >> i) << 2) & 0x04040404;
1506
+
1507
+ const int vi = __vsubss4(vil, vih);
1508
+
1509
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1510
+ }
1511
+
1512
+ return d*sumf;
1513
+ #else
1514
+ return 0.0f; // only to satisfy the compiler
1515
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1516
+ }
1517
+
1518
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1519
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1520
+
1521
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1522
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
1523
+
1524
+ const int bq8_offset = QR4_K * (iqs / QI8_1);
1525
+
1526
+ float sumf_d = 0.0f;
1527
+ float sumf_m = 0.0f;
1528
+
1529
+ const float d = bq4_K->d;
1530
+ const float dmin = bq4_K->dmin;
1531
+
1532
+ const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
1533
+
1534
+ for (int i = 0; i < QR4_K; ++i) {
1535
+ const int isc = bq8_offset + i;
1536
+
1537
+ uint8_t sc, m;
1538
+ get_scale_min_k4(isc, bq4_K->scales, sc, m);
1539
+
1540
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1541
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1542
+ const float d8i = bq8i->d;
1543
+
1544
+ const int vi = (v >> (4*i)) & 0x0F0F0F0F;
1545
+
1546
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1547
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
1548
+ }
1549
+
1550
+ return d*sumf_d - dmin*sumf_m;
1551
+ #else
1552
+ return 0.0f; // only to satisfy the compiler
1553
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1554
+ }
1555
+
1556
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
1557
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1558
+
1559
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1560
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
1561
+
1562
+ const int bq8_offset = QR5_K * (iqs / QI8_1);
1563
+
1564
+ float sumf_d = 0.0f;
1565
+ float sumf_m = 0.0f;
1566
+
1567
+ const float d = bq5_K->d;
1568
+ const float dmin = bq5_K->dmin;
1569
+
1570
+ const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
1571
+
1572
+ const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
1573
+
1574
+ for (int i = 0; i < QR5_K; ++i) {
1575
+ const int isc = bq8_offset + i;
1576
+
1577
+ uint8_t sc, m;
1578
+ get_scale_min_k4(isc, bq5_K->scales, sc, m);
1579
+
1580
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1581
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1582
+ const float d8i = bq8i->d;
1583
+
1584
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1585
+
1586
+ const int vih = ((vh >> i) << 4) & 0x10101010;
1587
+
1588
+ const int vi = vil | vih;
1589
+
1590
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1591
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
1592
+ }
1593
+
1594
+ return d*sumf_d - dmin*sumf_m;
1595
+ #else
1596
+ return 0.0f; // only to satisfy the compiler
1597
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1598
+ }
1599
+
1600
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
1601
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1602
+
1603
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1604
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
1605
+
1606
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
1607
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
1608
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
1609
+
1610
+ float sumf = 0.0f;
1611
+
1612
+ const float d = bq6_K->d;
1613
+
1614
+ int vl;
1615
+ memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
1616
+
1617
+ int vh;
1618
+ memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
1619
+
1620
+ for (int i = 0; i < QR6_K; ++i) {
1621
+ const int sc = bq6_K->scales[scale_offset + 4*i];
1622
+
1623
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
1624
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
1625
+ const float d8i = bq8i->d;
1626
+
1627
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1628
+
1629
+ const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
1630
+
1631
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1632
+
1633
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1634
+ }
1635
+
1636
+ return d*sumf;
1637
+ #else
1638
+ return 0.0f; // only to satisfy the compiler
1639
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1366
1640
  }
1367
1641
 
1368
1642
  template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
1369
- static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
1643
+ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1370
1644
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
1371
1645
 
1372
1646
  if (row >= nrows) {
@@ -1385,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
1385
1659
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1386
1660
  const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
1387
1661
 
1388
- const int iby = i + threadIdx.x / qi; // y block index
1662
+ const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
1389
1663
 
1390
1664
  const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
1391
1665
 
@@ -1404,7 +1678,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
1404
1678
  }
1405
1679
 
1406
1680
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1407
- static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
1681
+ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
1408
1682
  // qk = quantized weights per x block
1409
1683
  // qr = number of quantized weights per data value in x block
1410
1684
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1745,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
1471
1745
  }
1472
1746
  }
1473
1747
 
1474
- static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1748
+ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1475
1749
  const half * x = (const half *) vx;
1476
1750
 
1477
1751
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1792,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
1518
1792
  }
1519
1793
 
1520
1794
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1521
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1795
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1522
1796
  const int row_stride_x, const int channel_stride_x) {
1523
1797
 
1524
1798
  const half * x = (const half *) vx;
@@ -1623,6 +1897,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
1623
1897
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
1624
1898
  }
1625
1899
 
1900
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
1901
+ const int col = blockDim.x*blockIdx.x + threadIdx.x;
1902
+ const int half_n_dims = ncols/4;
1903
+
1904
+ if (col >= half_n_dims) {
1905
+ return;
1906
+ }
1907
+
1908
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
1909
+ const int i = row*ncols + col;
1910
+
1911
+ const float col_theta_scale = powf(theta_scale, col);
1912
+
1913
+ const float theta = p*col_theta_scale;
1914
+ const float sin_theta = sinf(theta);
1915
+ const float cos_theta = cosf(theta);
1916
+
1917
+ const float x0 = x[i + 0];
1918
+ const float x1 = x[i + half_n_dims];
1919
+
1920
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
1921
+ dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
1922
+
1923
+ const float block_theta = block_p*col_theta_scale;
1924
+ const float sin_block_theta = sinf(block_theta);
1925
+ const float cos_block_theta = cosf(block_theta);
1926
+
1927
+ const float x2 = x[i + half_n_dims * 2];
1928
+ const float x3 = x[i + half_n_dims * 3];
1929
+
1930
+ dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
1931
+ dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
1932
+ }
1933
+
1626
1934
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
1627
1935
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
1628
1936
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1688,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
1688
1996
  dst[i] = scale * x[i];
1689
1997
  }
1690
1998
 
1691
- static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
1692
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
1693
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
1999
+ static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
2000
+ const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
2001
+ add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
1694
2002
  }
1695
2003
 
1696
2004
  static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1703,20 +2011,31 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
1703
2011
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
1704
2012
  }
1705
2013
 
2014
+ static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
2015
+ const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
2016
+ gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
2017
+ }
2018
+
1706
2019
  static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
1707
2020
  const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
1708
2021
  silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
1709
2022
  }
1710
2023
 
2024
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2025
+ GGML_ASSERT(ncols % WARP_SIZE == 0);
2026
+ const dim3 block_dims(WARP_SIZE, 1, 1);
2027
+ norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
2028
+ }
2029
+
1711
2030
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1712
2031
  GGML_ASSERT(ncols % WARP_SIZE == 0);
1713
2032
  const dim3 block_dims(WARP_SIZE, 1, 1);
1714
2033
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
1715
2034
  }
1716
2035
 
1717
- static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
2036
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
1718
2037
  const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
1719
- quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
2038
+ quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
1720
2039
  }
1721
2040
 
1722
2041
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -1873,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
1873
2192
  }
1874
2193
 
1875
2194
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1876
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2195
+ GGML_ASSERT(ncols % QK4_0 == 0);
1877
2196
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1878
2197
  const dim3 block_nums(1, block_num_y, 1);
1879
2198
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1882,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
1882
2201
  }
1883
2202
 
1884
2203
  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1885
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2204
+ GGML_ASSERT(ncols % QK4_1 == 0);
1886
2205
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1887
2206
  const dim3 block_nums(1, block_num_y, 1);
1888
2207
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1891,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
1891
2210
  }
1892
2211
 
1893
2212
  static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1894
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2213
+ GGML_ASSERT(ncols % QK5_0 == 0);
1895
2214
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1896
2215
  const dim3 block_nums(1, block_num_y, 1);
1897
2216
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1900,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
1900
2219
  }
1901
2220
 
1902
2221
  static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1903
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2222
+ GGML_ASSERT(ncols % QK5_1 == 0);
1904
2223
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1905
2224
  const dim3 block_nums(1, block_num_y, 1);
1906
2225
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1909,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
1909
2228
  }
1910
2229
 
1911
2230
  static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1912
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2231
+ GGML_ASSERT(ncols % QK8_0 == 0);
1913
2232
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1914
2233
  const dim3 block_nums(1, block_num_y, 1);
1915
2234
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1917,6 +2236,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
1917
2236
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1918
2237
  }
1919
2238
 
2239
+ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2240
+ GGML_ASSERT(ncols % QK_K == 0);
2241
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2242
+ const dim3 block_nums(1, block_num_y, 1);
2243
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2244
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
2245
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2246
+ }
2247
+
2248
+ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2249
+ GGML_ASSERT(ncols % QK_K == 0);
2250
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2251
+ const dim3 block_nums(1, block_num_y, 1);
2252
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2253
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
2254
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2255
+ }
2256
+
2257
+ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2258
+ GGML_ASSERT(ncols % QK_K == 0);
2259
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2260
+ const dim3 block_nums(1, block_num_y, 1);
2261
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2262
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
2263
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2264
+ }
2265
+
2266
+ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2267
+ GGML_ASSERT(ncols % QK_K == 0);
2268
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2269
+ const dim3 block_nums(1, block_num_y, 1);
2270
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2271
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
2272
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2273
+ }
2274
+
2275
+ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2276
+ GGML_ASSERT(ncols % QK_K == 0);
2277
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2278
+ const dim3 block_nums(1, block_num_y, 1);
2279
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2280
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
2281
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2282
+ }
2283
+
1920
2284
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1921
2285
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1922
2286
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2009,6 +2373,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
2009
2373
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
2010
2374
  }
2011
2375
 
2376
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
2377
+ GGML_ASSERT(nrows % 4 == 0);
2378
+ const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
2379
+ const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
2380
+ const dim3 block_nums(num_blocks_x, nrows, 1);
2381
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
2382
+ }
2383
+
2012
2384
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
2013
2385
  const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
2014
2386
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2051,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
2051
2423
  scoped_spin_lock lock(g_cuda_pool_lock);
2052
2424
  int id;
2053
2425
  CUDA_CHECK(cudaGetDevice(&id));
2054
-
2426
+ #ifdef DEBUG_CUDA_MALLOC
2427
+ int nnz = 0;
2428
+ size_t max_size = 0, tot_size = 0;
2429
+ #endif
2430
+ size_t best_diff = 1ull << 36;
2431
+ int ibest = -1;
2055
2432
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
2056
2433
  cuda_buffer& b = g_cuda_buffer_pool[id][i];
2057
- if (b.size >= size && b.ptr != nullptr) {
2058
- void * ptr = b.ptr;
2059
- *actual_size = b.size;
2060
- b.ptr = nullptr;
2061
- b.size = 0;
2062
- return ptr;
2434
+ if (b.ptr != nullptr) {
2435
+ #ifdef DEBUG_CUDA_MALLOC
2436
+ ++nnz;
2437
+ tot_size += b.size;
2438
+ if (b.size > max_size) max_size = b.size;
2439
+ #endif
2440
+ if (b.size >= size) {
2441
+ size_t diff = b.size - size;
2442
+ if (diff < best_diff) {
2443
+ best_diff = diff;
2444
+ ibest = i;
2445
+ if (!best_diff) {
2446
+ void * ptr = b.ptr;
2447
+ *actual_size = b.size;
2448
+ b.ptr = nullptr;
2449
+ b.size = 0;
2450
+ return ptr;
2451
+ }
2452
+ }
2453
+ }
2063
2454
  }
2064
2455
  }
2456
+ if (ibest >= 0) {
2457
+ cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
2458
+ void * ptr = b.ptr;
2459
+ *actual_size = b.size;
2460
+ b.ptr = nullptr;
2461
+ b.size = 0;
2462
+ return ptr;
2463
+ }
2464
+ #ifdef DEBUG_CUDA_MALLOC
2465
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
2466
+ (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
2467
+ #endif
2065
2468
  void * ptr;
2066
- CUDA_CHECK(cudaMalloc((void **) &ptr, size));
2067
- *actual_size = size;
2469
+ size_t look_ahead_size = (size_t) (1.05 * size);
2470
+ look_ahead_size = 256 * ((look_ahead_size + 255)/256);
2471
+ CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
2472
+ *actual_size = look_ahead_size;
2068
2473
  return ptr;
2069
2474
  }
2070
2475
 
@@ -2140,6 +2545,9 @@ void ggml_init_cublas() {
2140
2545
  }
2141
2546
 
2142
2547
  void ggml_cuda_set_tensor_split(const float * tensor_split) {
2548
+ if (tensor_split == nullptr) {
2549
+ return;
2550
+ }
2143
2551
  bool all_zero = true;
2144
2552
  for (int i = 0; i < g_device_count; ++i) {
2145
2553
  if (tensor_split[i] != 0.0f) {
@@ -2236,16 +2644,19 @@ inline void ggml_cuda_op_add(
2236
2644
 
2237
2645
  GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
2238
2646
  GGML_ASSERT(src1_ddf_i != nullptr);
2239
- GGML_ASSERT(dst_ddf_i != nullptr);
2647
+ GGML_ASSERT(dst_ddf_i != nullptr);
2240
2648
 
2241
- const int64_t ne0 = src0->ne[0];
2649
+ const int64_t ne00 = src0->ne[0];
2242
2650
  const int64_t i01_diff = i01_high - i01_low;
2243
2651
 
2652
+ const int64_t ne10 = src1->ne[0];
2653
+ const int64_t ne11 = src1->ne[1];
2654
+
2244
2655
  // compute
2245
2656
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
2246
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
2657
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
2247
2658
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
2248
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
2659
+ add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
2249
2660
  } else {
2250
2661
  GGML_ASSERT(false);
2251
2662
  }
@@ -2264,27 +2675,41 @@ inline void ggml_cuda_op_mul(
2264
2675
 
2265
2676
  GGML_ASSERT(src0_ddf_i != nullptr);
2266
2677
  GGML_ASSERT(src1_ddf_i != nullptr);
2267
- GGML_ASSERT(dst_ddf_i != nullptr);
2678
+ GGML_ASSERT(dst_ddf_i != nullptr);
2268
2679
 
2269
2680
  const int64_t ne00 = src0->ne[0];
2681
+ const int64_t i01_diff = i01_high - i01_low;
2270
2682
 
2271
2683
  const int64_t ne10 = src1->ne[0];
2272
2684
  const int64_t ne11 = src1->ne[1];
2273
2685
 
2274
- for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
2275
- const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
2686
+ mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
2276
2687
 
2277
- float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
2278
- float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
2279
- float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
2688
+ (void) dst;
2689
+ (void) src0_ddq_i;
2690
+ (void) i02;
2691
+ }
2280
2692
 
2281
- // compute
2282
- mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
2283
- }
2693
+ inline void ggml_cuda_op_gelu(
2694
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2695
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2696
+ cudaStream_t & cudaStream_main){
2697
+
2698
+ GGML_ASSERT(src0_ddf_i != nullptr);
2699
+ GGML_ASSERT(dst_ddf_i != nullptr);
2700
+
2701
+ const int64_t ne00 = src0->ne[0];
2702
+ const int64_t i01_diff = i01_high - i01_low;
2703
+
2704
+ // compute
2705
+ gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
2284
2706
 
2707
+ (void) src1;
2285
2708
  (void) dst;
2286
2709
  (void) src0_ddq_i;
2710
+ (void) src1_ddf_i;
2287
2711
  (void) i02;
2712
+ (void) i1;
2288
2713
  }
2289
2714
 
2290
2715
  inline void ggml_cuda_op_silu(
@@ -2309,6 +2734,28 @@ inline void ggml_cuda_op_silu(
2309
2734
  (void) i1;
2310
2735
  }
2311
2736
 
2737
+ inline void ggml_cuda_op_norm(
2738
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2739
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
2740
+ cudaStream_t & cudaStream_main){
2741
+
2742
+ GGML_ASSERT(src0_ddf_i != nullptr);
2743
+ GGML_ASSERT(dst_ddf_i != nullptr);
2744
+
2745
+ const int64_t ne00 = src0->ne[0];
2746
+ const int64_t i01_diff = i01_high - i01_low;
2747
+
2748
+ // compute
2749
+ norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2750
+
2751
+ (void) src1;
2752
+ (void) dst;
2753
+ (void) src0_ddq_i;
2754
+ (void) src1_ddf_i;
2755
+ (void) i02;
2756
+ (void) i1;
2757
+ }
2758
+
2312
2759
  inline void ggml_cuda_op_rms_norm(
2313
2760
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2314
2761
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2349,22 +2796,30 @@ inline void ggml_cuda_op_mul_mat_vec(
2349
2796
  int id;
2350
2797
  CUDA_CHECK(cudaGetDevice(&id));
2351
2798
 
2352
- const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
2799
+ bool mul_mat_vec_q_implemented =
2800
+ src0->type == GGML_TYPE_Q4_0 ||
2353
2801
  src0->type == GGML_TYPE_Q4_1 ||
2354
2802
  src0->type == GGML_TYPE_Q5_0 ||
2355
2803
  src0->type == GGML_TYPE_Q5_1 ||
2356
2804
  src0->type == GGML_TYPE_Q8_0;
2357
-
2358
- // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
2359
- // However, they have bad performance with Pascal cards.
2360
- // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
2361
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
2805
+ #if QK_K == 256
2806
+ mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
2807
+ src0->type == GGML_TYPE_Q2_K ||
2808
+ src0->type == GGML_TYPE_Q3_K ||
2809
+ src0->type == GGML_TYPE_Q4_K ||
2810
+ src0->type == GGML_TYPE_Q5_K ||
2811
+ src0->type == GGML_TYPE_Q6_K;
2812
+ #endif // QK_K == 256
2813
+
2814
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
2362
2815
  #endif
2363
2816
 
2364
2817
  if (use_mul_mat_vec_q) {
2818
+ int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
2819
+ padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
2365
2820
  size_t as;
2366
- void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
2367
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
2821
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
2822
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
2368
2823
 
2369
2824
  switch (src0->type) {
2370
2825
  case GGML_TYPE_Q4_0:
@@ -2382,6 +2837,21 @@ inline void ggml_cuda_op_mul_mat_vec(
2382
2837
  case GGML_TYPE_Q8_0:
2383
2838
  mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2384
2839
  break;
2840
+ case GGML_TYPE_Q2_K:
2841
+ mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2842
+ break;
2843
+ case GGML_TYPE_Q3_K:
2844
+ mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2845
+ break;
2846
+ case GGML_TYPE_Q4_K:
2847
+ mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2848
+ break;
2849
+ case GGML_TYPE_Q5_K:
2850
+ mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2851
+ break;
2852
+ case GGML_TYPE_Q6_K:
2853
+ mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2854
+ break;
2385
2855
  default:
2386
2856
  GGML_ASSERT(false);
2387
2857
  break;
@@ -2516,13 +2986,26 @@ inline void ggml_cuda_op_rope(
2516
2986
  const int n_past = ((int32_t *) src1->data)[0];
2517
2987
  const int n_dims = ((int32_t *) src1->data)[1];
2518
2988
  const int mode = ((int32_t *) src1->data)[2];
2519
- GGML_ASSERT(mode == 0);
2989
+ const int n_ctx = ((int32_t *) src1->data)[3];
2990
+
2991
+ // RoPE alteration for extended context
2992
+ float freq_base, freq_scale;
2993
+ memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
2994
+ memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
2995
+
2996
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
2997
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
2520
2998
 
2521
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
2522
- const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
2999
+ bool is_glm = mode & 4;
2523
3000
 
2524
3001
  // compute
2525
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
3002
+ if (is_glm) {
3003
+ const float id_p = min(p, n_ctx - 2.f);
3004
+ const float block_p = max(p - (n_ctx - 2.f), 0.f);
3005
+ rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
3006
+ } else {
3007
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
3008
+ }
2526
3009
 
2527
3010
  (void) dst;
2528
3011
  (void) src0_ddq_i;
@@ -2925,11 +3408,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2925
3408
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
2926
3409
  }
2927
3410
 
3411
+ void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3412
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3413
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
3414
+ }
3415
+
2928
3416
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2929
3417
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2930
3418
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
2931
3419
  }
2932
3420
 
3421
+ void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3422
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3423
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
3424
+ }
3425
+
2933
3426
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2934
3427
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2935
3428
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3085,6 +3578,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
3085
3578
  (void) dst;
3086
3579
  }
3087
3580
 
3581
+ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3582
+ ggml_cuda_cpy(src0, dst, nullptr);
3583
+ (void) src1;
3584
+ }
3585
+
3088
3586
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3089
3587
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3090
3588
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3108,7 +3606,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens

  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  int nrows = ggml_nrows(tensor);
+
+ const int64_t ne0 = tensor->ne[0];
+
  const size_t nb1 = tensor->nb[1];
+
  ggml_backend backend = tensor->backend;
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));
@@ -3137,13 +3639,26 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  int64_t nrows_split = row_high - row_low;

  const size_t offset_split = row_low*nb1;
- const size_t size = ggml_nbytes_split(tensor, nrows_split);
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+ }

- void * buf;
+ char * buf;
  CUDA_CHECK(cudaMalloc(&buf, size));
- void * buf_host = (char*)data + offset_split;
+ char * buf_host = (char*)data + offset_split;

- cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+ // set padding to 0 to avoid possible NaN values
+ if (size > original_size) {
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+ }
+
+
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));

  extra->data_device[id] = buf;

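The added padding rounds the last row of each split up to a multiple of MATRIX_ROW_PADDING elements (256, per the comment) so that quantized kernels can safely read whole blocks past the logical end of the row, and the extra bytes are zeroed so they cannot introduce NaNs. A stand-alone sketch of the same size arithmetic follows; padded_row_bytes and the fp16 example values are illustrative stand-ins for MATRIX_ROW_PADDING, ggml_type_size() and ggml_blck_size().

// Sketch: extra bytes appended after the last row to reach a padding multiple.
#include <cstddef>
#include <cstdio>

static size_t padded_row_bytes(size_t ne0, size_t pad, size_t type_size, size_t blck) {
    size_t extra = 0;
    if (ne0 % pad != 0) {
        extra = (pad - ne0 % pad) * type_size / blck; // bytes added after the last row
    }
    return extra;
}

int main() {
    // e.g. a row of 4001 fp16 values (2 bytes each, block size 1) is padded
    // by 95 elements = 190 bytes
    printf("%zu\n", padded_row_bytes(4001, 256, 2, 1));
    return 0;
}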
@@ -3177,43 +3692,60 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
  delete extra;
  }

+ static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+ static size_t g_temp_tensor_extra_index = 0;
+
+ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (g_temp_tensor_extras == nullptr) {
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = g_temp_tensor_extra_index;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+ }
+
  void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
  if (scratch && g_scratch_size == 0) {
  return;
  }

  // recursively assign CUDA buffers until a compute tensor is found
- if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
- const ggml_op src0_op = tensor->src0->op;
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
- ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+ const ggml_op src0_op = tensor->src[0]->op;
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
  }
  }
- if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
  }

  tensor->backend = GGML_BACKEND_GPU;
- struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
+ struct ggml_tensor_extra_gpu * extra;

- const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW ||
  force_inplace;
  const size_t size = ggml_nbytes(tensor);

  CUDA_CHECK(cudaSetDevice(g_main_device));
- if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
+ memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src0_ddc + offset;
  } else if (tensor->op == GGML_OP_CPY) {
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
  void * src1_ddv = src1_extra->data_device[g_main_device];
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src1_ddv;
  } else if (scratch) {
  GGML_ASSERT(size <= g_scratch_size);
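ggml_cuda_alloc_temp_tensor_extra() above hands out per-tensor metadata from a fixed pool of GGML_MAX_NODES slots reused in round-robin order, instead of new-ing a fresh ggml_tensor_extra_gpu for every inplace, view, copy, or scratch tensor. A minimal stand-alone sketch of that ring-buffer pattern follows; Extra, kPoolSize, and alloc_temp_extra are illustrative stand-ins, not ggml names.

// Sketch: fixed pool of metadata slots reused round-robin, so temporary
// per-node extras never need an individual delete.
#include <array>
#include <cstring>

struct Extra { void * data_device[1]; };

constexpr size_t kPoolSize = 4096;              // stand-in for GGML_MAX_NODES
static std::array<Extra, kPoolSize> g_pool;
static size_t g_pool_index = 0;

static Extra * alloc_temp_extra() {
    Extra * e = &g_pool[g_pool_index];
    g_pool_index = (g_pool_index + 1) % kPoolSize; // wrap around, overwriting old slots
    std::memset(e, 0, sizeof(*e));                 // slots are reused, so clear before use
    return e;
}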
@@ -3226,6 +3758,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
  g_scratch_buffer = data;
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = data + g_scratch_offset;

  g_scratch_offset += size;
@@ -3235,6 +3768,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  void * data;
  CUDA_CHECK(cudaMalloc(&data, size));
  CUDA_CHECK(cudaMemset(data, 0, size));
+ extra = new ggml_tensor_extra_gpu;
+ memset(extra, 0, sizeof(*extra));
  extra->data_device[g_main_device] = data;
  }

@@ -3283,10 +3818,16 @@ void ggml_cuda_free_scratch() {
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
- || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
- || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

  switch (tensor->op) {
+ case GGML_OP_DUP:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
  case GGML_OP_ADD:
  if (!any_on_device) {
  return false;
@@ -3299,12 +3840,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_mul;
  break;
+ case GGML_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_gelu;
+ break;
  case GGML_OP_SILU:
  if (!any_on_device) {
  return false;
  }
  func = ggml_cuda_silu;
  break;
+ case GGML_OP_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_norm;
+ break;
  case GGML_OP_RMS_NORM:
  if (!any_on_device) {
  return false;
@@ -3312,7 +3865,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  func = ggml_cuda_rms_norm;
  break;
  case GGML_OP_MUL_MAT:
- if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
  return false;
  }
  func = ggml_cuda_mul_mat;
@@ -3329,6 +3882,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_cpy;
  break;
+ case GGML_OP_CONT:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE:
@@ -3366,6 +3925,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return true;
  }
- func(tensor->src0, tensor->src1, tensor);
+ func(tensor->src[0], tensor->src[1], tensor);
  return true;
  }
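The final hunks complete the migration from the old src0/src1/opt[] tensor fields to the unified src[] array: the op switch selects a function pointer and the call site now passes src[0] and src[1], with GGML_OP_DUP and GGML_OP_CONT both routed through the device-copy path (ggml_cuda_dup). A condensed stand-alone sketch of that dispatch shape follows; tensor_t, op_t, compute_forward, and the op values are illustrative stand-ins for the ggml definitions.

// Sketch: table-like dispatch over an op enum, calling through the src[] array.
struct tensor_t;
typedef void (*cuda_func_t)(const tensor_t * src0, const tensor_t * src1, tensor_t * dst);

enum op_t { OP_DUP, OP_ADD, OP_CONT };

struct tensor_t {
    op_t       op;
    tensor_t * src[3];   // replaces the old src0/src1/opt[] fields
};

static void cuda_dup(const tensor_t *, const tensor_t *, tensor_t *) { /* device copy */ }
static void cuda_add(const tensor_t *, const tensor_t *, tensor_t *) { /* element-wise add */ }

static bool compute_forward(tensor_t * t) {
    cuda_func_t func = nullptr;
    switch (t->op) {
        case OP_DUP:
        case OP_CONT: func = cuda_dup; break;  // CONT reuses the DUP path, as above
        case OP_ADD:  func = cuda_add; break;
        default:      return false;            // op not offloaded to CUDA
    }
    func(t->src[0], t->src[1], t);
    return true;
}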