llama_cpp 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,8 @@
13
13
  #include "ggml-cuda.h"
14
14
  #include "ggml.h"
15
15
 
16
+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
17
+
16
18
  #if defined(_MSC_VER)
17
19
  #pragma warning(disable: 4244 4267) // possible loss of data
18
20
  #endif
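The new MIN_CC_DP4A constant (610, i.e. compute capability 6.1) is the minimum architecture for __dp4a, which the quantized dot-product kernels below rely on: it multiplies four packed signed 8-bit lanes and adds the products to a 32-bit accumulator in one instruction. A minimal host-side sketch of those semantics (dp4a_ref is an illustrative name, not part of the diff):

    #include <cassert>
    #include <cstdint>

    // Reference semantics of CUDA's __dp4a(a, b, c) for signed operands:
    // treat a and b as four int8 lanes, multiply lane-wise, add the products to c.
    static int dp4a_ref(int a, int b, int c) {
        for (int k = 0; k < 4; ++k) {
            c += (int)(int8_t)((uint32_t)a >> (8*k)) * (int)(int8_t)((uint32_t)b >> (8*k));
        }
        return c;
    }

    int main() {
        assert(dp4a_ref(0x01020304, 0x01010101, 0) == 1 + 2 + 3 + 4);
        return 0;
    }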
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(
74
76
 
75
77
  #define QK4_0 32
76
78
  #define QR4_0 2
77
- #define QI4_0 4
79
+ #define QI4_0 (QK4_0 / (4 * QR4_0))
78
80
  typedef struct {
79
81
  half d; // delta
80
82
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
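QI4_0 (and the other QIx_y constants changed below) is now derived rather than hard-coded: a block holds QK values packed roughly QR to a byte, so its quants occupy QK/QR bytes, i.e. QK/(4*QR) 32-bit integers, which is also how many threads the vec_dot kernels assign to one block. A quick plain-C++ sanity check of that arithmetic (the local constants only mirror the macros):

    #include <cassert>

    int main() {
        // QI = (values per block / values per byte) / (bytes per int32)
        const int QK4_0 = 32, QR4_0 = 2;   // two nibbles per byte -> 16 bytes -> 4 ints
        const int QK8_0 = 32, QR8_0 = 1;   // one int8 per value   -> 32 bytes -> 8 ints
        assert(QK4_0 / (4 * QR4_0) == 4);
        assert(QK8_0 / (4 * QR8_0) == 8);
        return 0;
    }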
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
83
85
 
84
86
  #define QK4_1 32
85
87
  #define QR4_1 2
86
- #define QI4_1 4
88
+ #define QI4_1 (QK4_1 / (4 * QR4_1))
87
89
  typedef struct {
88
90
  half d; // delta
89
91
  half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
93
95
 
94
96
  #define QK5_0 32
95
97
  #define QR5_0 2
96
- #define QI5_0 4
98
+ #define QI5_0 (QK5_0 / (4 * QR5_0))
97
99
  typedef struct {
98
100
  half d; // delta
99
101
  uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
103
105
 
104
106
  #define QK5_1 32
105
107
  #define QR5_1 2
106
- #define QI5_1 4
108
+ #define QI5_1 (QK5_1 / (4 * QR5_1))
107
109
  typedef struct {
108
110
  half d; // delta
109
111
  half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
114
116
 
115
117
  #define QK8_0 32
116
118
  #define QR8_0 1
117
- #define QI8_0 8
119
+ #define QI8_0 (QK8_0 / (4 * QR8_0))
118
120
  typedef struct {
119
121
  half d; // delta
120
122
  int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
123
125
 
124
126
  #define QK8_1 32
125
127
  #define QR8_1 1
126
- #define QI8_1 8
128
+ #define QI8_1 (QK8_1 / (4 * QR8_1))
127
129
  typedef struct {
128
130
  half d; // delta
129
131
  half s; // unquantized sum
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
143
145
  #define K_SCALE_SIZE 12
144
146
  #endif
145
147
 
148
+ #define QR2_K 4
149
+ #define QI2_K (QK_K / (4*QR2_K))
146
150
  typedef struct {
147
151
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
148
152
  uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
151
155
  } block_q2_K;
152
156
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
153
157
 
158
+ #define QR3_K 4
159
+ #define QI3_K (QK_K / (4*QR3_K))
154
160
  typedef struct {
155
161
  uint8_t hmask[QK_K/8]; // quants - high bit
156
162
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
163
169
  } block_q3_K;
164
170
  //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
165
171
 
172
+ #define QR4_K 2
173
+ #define QI4_K (QK_K / (4*QR4_K))
166
174
  #ifdef GGML_QKK_64
167
175
  typedef struct {
168
176
  half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
180
188
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
181
189
  #endif
182
190
 
191
+ #define QR5_K 2
192
+ #define QI5_K (QK_K / (4*QR5_K))
183
193
  #ifdef GGML_QKK_64
184
194
  typedef struct {
185
195
  half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
199
209
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
200
210
  #endif
201
211
 
212
+ #define QR6_K 2
213
+ #define QI6_K (QK_K / (4*QR6_K))
202
214
  typedef struct {
203
215
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
204
216
  uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -208,7 +220,7 @@ typedef struct {
208
220
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
209
221
 
210
222
  #define WARP_SIZE 32
211
- #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
223
+ #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
212
224
 
213
225
  #define CUDA_ADD_BLOCK_SIZE 256
214
226
  #define CUDA_MUL_BLOCK_SIZE 256
@@ -240,13 +252,13 @@ struct ggml_tensor_extra_gpu {
240
252
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
241
253
  };
242
254
 
243
- static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
255
+ static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
244
256
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
245
257
 
246
- if (i >= k) {
258
+ if (i >= kx) {
247
259
  return;
248
260
  }
249
- dst[i] = x[i] + y[i];
261
+ dst[i] = x[i] + y[i%ky];
250
262
  }
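add_f32 now takes separate sizes kx and ky and indexes y with i % ky, so a smaller src1 is tiled (broadcast) across src0 instead of requiring equal element counts, which the removed assertion in ggml_cuda_op_add further down used to enforce. A host-side sketch of the same broadcast rule (add_broadcast is an illustrative helper):

    #include <cassert>
    #include <vector>

    // Mirror of the kernel's indexing: y (ky elements) repeats across x (kx elements).
    static std::vector<float> add_broadcast(const std::vector<float> & x,
                                            const std::vector<float> & y) {
        std::vector<float> dst(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            dst[i] = x[i] + y[i % y.size()];
        }
        return dst;
    }

    int main() {
        const std::vector<float> dst = add_broadcast({1, 2, 3, 4}, {10, 20});
        assert(dst[0] == 11 && dst[1] == 22 && dst[2] == 13 && dst[3] == 24);
        return 0;
    }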
251
263
 
252
264
  static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -320,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
320
332
  }
321
333
  }
322
334
 
323
- static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
335
+ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
324
336
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
325
337
  const int tid = threadIdx.x;
326
338
 
327
- const float eps = 1e-6f;
328
-
329
339
  float tmp = 0.0f; // partial sum for thread in warp
330
340
 
331
341
  for (int col = tid; col < ncols; col += WARP_SIZE) {
@@ -923,12 +933,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
923
933
  uint16_t aux[4];
924
934
  const uint8_t * sc = (const uint8_t *)aux;
925
935
 
936
+ #if K_QUANTS_PER_ITERATION == 2
937
+ uint32_t q32[4];
938
+ const uint8_t * q4 = (const uint8_t *)q32;
939
+ #else
940
+ uint16_t q16[4];
941
+ const uint8_t * q4 = (const uint8_t *)q16;
942
+ #endif
943
+
926
944
  float tmp = 0; // partial sum for thread in warp
927
945
 
928
946
  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
929
947
 
930
- const uint8_t * q1 = x[i].qs + q_offset;
931
- const uint8_t * q2 = q1 + 64;
932
948
  const float * y1 = yy + i*QK_K + y_offset;
933
949
  const float * y2 = y1 + 128;
934
950
 
@@ -941,14 +957,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
941
957
  aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
942
958
  aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
943
959
 
960
+ #if K_QUANTS_PER_ITERATION == 2
961
+ const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
962
+ const uint32_t * q2 = q1 + 16;
963
+
964
+ q32[0] = q1[0] & 0x0f0f0f0f;
965
+ q32[1] = q1[0] & 0xf0f0f0f0;
966
+ q32[2] = q2[0] & 0x0f0f0f0f;
967
+ q32[3] = q2[0] & 0xf0f0f0f0;
968
+
944
969
  float4 s = {0.f, 0.f, 0.f, 0.f};
945
970
  float smin = 0;
946
- for (int l = 0; l < n; ++l) {
947
- s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
948
- s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
971
+ for (int l = 0; l < 4; ++l) {
972
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
973
+ s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
949
974
  smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
950
975
  }
951
- tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
976
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
977
+ #else
978
+ const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
979
+ const uint16_t * q2 = q1 + 32;
980
+
981
+ q16[0] = q1[0] & 0x0f0f;
982
+ q16[1] = q1[0] & 0xf0f0;
983
+ q16[2] = q2[0] & 0x0f0f;
984
+ q16[3] = q2[0] & 0xf0f0;
985
+
986
+ float4 s = {0.f, 0.f, 0.f, 0.f};
987
+ float smin = 0;
988
+ for (int l = 0; l < 2; ++l) {
989
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
990
+ s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
991
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
992
+ }
993
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
994
+ #endif
952
995
 
953
996
  }
954
997
  #else
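In the QK_K == 256 path above, the K_QUANTS_PER_ITERATION == 2 branch now loads 32 bits of q4_K quants at once and splits them with 0x0f0f0f0f / 0xf0f0f0f0 masks; the high nibbles are left in place, so their partial sums are folded in with an extra 1/16 factor on sc[1] and sc[5]. A small check of the identity this relies on:

    #include <cassert>
    #include <cstdint>

    int main() {
        // Keeping the high nibble in place (b & 0xF0) equals 16 * (b >> 4),
        // hence the 1.f/16.f factor applied to those partial sums.
        const uint8_t b = 0xA7;                  // low nibble 7, high nibble 10
        assert((b & 0x0F) == 7);
        assert((b & 0xF0) == 16 * (b >> 4));     // 0xA0 == 160 == 16 * 10
        return 0;
    }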
@@ -1028,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
1028
1071
  uint16_t aux[4];
1029
1072
  const uint8_t * sc = (const uint8_t *)aux;
1030
1073
 
1074
+ uint16_t q16[8];
1075
+ const uint8_t * q4 = (const uint8_t *)q16;
1076
+
1031
1077
  for (int i = ix; i < num_blocks_per_row; i += 2) {
1032
1078
 
1033
1079
  const uint8_t * ql1 = x[i].qs + q_offset;
1034
- const uint8_t * ql2 = ql1 + 64;
1035
1080
  const uint8_t * qh = x[i].qh + l0;
1036
1081
  const float * y1 = yy + i*QK_K + y_offset;
1037
1082
  const float * y2 = y1 + 128;
@@ -1047,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
1047
1092
 
1048
1093
  float4 sum = {0.f, 0.f, 0.f, 0.f};
1049
1094
  float smin = 0;
1095
+ const uint16_t * q1 = (const uint16_t *)ql1;
1096
+ const uint16_t * q2 = q1 + 32;
1097
+ q16[0] = q1[0] & 0x0f0f;
1098
+ q16[1] = q1[8] & 0x0f0f;
1099
+ q16[2] = (q1[0] >> 4) & 0x0f0f;
1100
+ q16[3] = (q1[8] >> 4) & 0x0f0f;
1101
+ q16[4] = q2[0] & 0x0f0f;
1102
+ q16[5] = q2[8] & 0x0f0f;
1103
+ q16[6] = (q2[0] >> 4) & 0x0f0f;
1104
+ q16[7] = (q2[8] >> 4) & 0x0f0f;
1050
1105
  for (int l = 0; l < n; ++l) {
1051
- sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
1052
- + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
1053
- sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
1054
- + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
1055
- sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
1056
- + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
1057
- sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
1058
- + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
1106
+ sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
1107
+ + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
1108
+ sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
1109
+ + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
1110
+ sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
1111
+ + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
1112
+ sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
1113
+ + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
1059
1114
  smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
1060
1115
  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
1061
1116
  }
@@ -1271,8 +1326,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
1271
1326
  y[iybs + iqs + y_offset] = v.y;
1272
1327
  }
1273
1328
 
1274
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1275
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1329
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1330
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1331
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1276
1332
  const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1277
1333
 
1278
1334
  int vi;
@@ -1293,11 +1349,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
1293
1349
  return sumi*d;
1294
1350
  #else
1295
1351
  return 0.0f; // only to satisfy the compiler
1296
- #endif // __CUDA_ARCH__ >= 610
1352
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1297
1353
  }
1298
1354
 
1299
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1300
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1355
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1356
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1357
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1301
1358
  const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1302
1359
 
1303
1360
  const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1318,11 +1375,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
1318
1375
  return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1319
1376
  #else
1320
1377
  return 0.0f; // only to satisfy the compiler
1321
- #endif // __CUDA_ARCH__ >= 610
1378
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1322
1379
  }
1323
1380
 
1324
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1325
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1381
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1382
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1383
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1326
1384
  const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1327
1385
 
1328
1386
  int qs;
@@ -1353,11 +1411,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
1353
1411
  return sumi*d;
1354
1412
  #else
1355
1413
  return 0.0f; // only to satisfy the compiler
1356
- #endif // __CUDA_ARCH__ >= 610
1414
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1357
1415
  }
1358
1416
 
1359
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1360
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1417
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1418
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1419
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1361
1420
  const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1362
1421
 
1363
1422
  const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1387,11 +1446,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
1387
1446
  return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1388
1447
  #else
1389
1448
  return 0.0f; // only to satisfy the compiler
1390
- #endif // __CUDA_ARCH__ >= 610
1449
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1391
1450
  }
1392
1451
 
1393
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1394
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1452
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1453
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1454
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1395
1455
  const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1396
1456
 
1397
1457
  int vi;
@@ -1406,7 +1466,342 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
1406
1466
  return sumi*d;
1407
1467
  #else
1408
1468
  return 0.0f; // only to satisfy the compiler
1409
- #endif // __CUDA_ARCH__ >= 610
1469
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1470
+ }
1471
+
1472
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
1473
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1474
+
1475
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1476
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
1477
+
1478
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
1479
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1480
+
1481
+ float sumf_d = 0.0f;
1482
+ float sumf_m = 0.0f;
1483
+
1484
+ const float d = bq2_K->d;
1485
+ const float dmin = bq2_K->dmin;
1486
+
1487
+ const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
1488
+
1489
+ for (int i = 0; i < QR2_K; ++i) {
1490
+ const int sc = bq2_K->scales[scale_offset + 2*i];
1491
+
1492
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1493
+ const float d8i = bq8i->d;
1494
+
1495
+ const int vi = (v >> (2*i)) & 0x03030303;
1496
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1497
+
1498
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
1499
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
1500
+ }
1501
+
1502
+ return d*sumf_d - dmin*sumf_m;
1503
+ #else
1504
+ return 0.0f; // only to satisfy the compiler
1505
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1506
+ }
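vec_dot_q2_K_q8_1 pulls quant i of every byte out of a 32-bit word with (v >> 2*i) & 0x03030303, feeds it to __dp4a against the int8 activations, and uses a second __dp4a against 0x01010101 (a plain byte sum of the q8_1 values) for the -dmin correction. A sketch of the 2-bit extraction step:

    #include <cassert>
    #include <cstdint>

    int main() {
        // Every q2_K byte packs four 2-bit quants; one shift+mask extracts
        // quant i of all four bytes of a 32-bit word at once.
        const uint32_t v = 0xE4E4E4E4u;                  // each byte packs 3,2,1,0
        for (int i = 0; i < 4; ++i) {
            const uint32_t vi = (v >> (2*i)) & 0x03030303u;
            assert(vi == (uint32_t)i * 0x01010101u);     // value i replicated per byte
        }
        return 0;
    }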
1507
+
1508
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
1509
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1510
+
1511
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1512
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
1513
+
1514
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
1515
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1516
+
1517
+ float sumf = 0.0f;
1518
+
1519
+ const float d = bq3_K->d;
1520
+
1521
+ int vl;
1522
+ memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
1523
+
1524
+ int vh;
1525
+ memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
1526
+ vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
1527
+ vh >>= bq8_offset;
1528
+
1529
+ for (int i = 0; i < QR3_K; ++i) {
1530
+ const int isc = scale_offset + 2*i;
1531
+
1532
+ const int isc_low = isc % (QK_K/32);
1533
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
1534
+ const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
1535
+
1536
+ const int isc_high = isc % (QK_K/64);
1537
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
1538
+ const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1539
+
1540
+ const int sc = (sc_low | sc_high) - 32;
1541
+
1542
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1543
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1544
+ const float d8i = bq8i->d;
1545
+
1546
+ const int vil = (vl >> (2*i)) & 0x03030303;
1547
+
1548
+ const int vih = ((vh >> i) << 2) & 0x04040404;
1549
+
1550
+ const int vi = __vsubss4(vil, vih);
1551
+
1552
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1553
+ }
1554
+
1555
+ return d*sumf;
1556
+ #else
1557
+ return 0.0f; // only to satisfy the compiler
1558
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1559
+ }
1560
+
1561
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1562
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1563
+
1564
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1565
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
1566
+
1567
+ float sumf_d = 0.0f;
1568
+ float sumf_m = 0.0f;
1569
+
1570
+ #ifndef GGML_QKK_64
1571
+
1572
+ // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
1573
+ const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
1574
+
1575
+ const float d = bq4_K->d;
1576
+ const float dmin = bq4_K->dmin;
1577
+
1578
+ // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
1579
+ // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
1580
+ // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
1581
+ // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
1582
+
1583
+ const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1584
+ const int v1 = q4[0];
1585
+ const int v2 = q4[4];
1586
+
1587
+ const uint16_t * scales = (const uint16_t *)bq4_K->scales;
1588
+ uint16_t aux[2];
1589
+ const int j = bq8_offset/2;
1590
+ if (j < 2) {
1591
+ aux[0] = scales[j+0] & 0x3f3f;
1592
+ aux[1] = scales[j+2] & 0x3f3f;
1593
+ } else {
1594
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1595
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
1596
+ }
1597
+ const uint8_t * sc = (const uint8_t *)aux;
1598
+ const uint8_t * m = sc + 2;
1599
+
1600
+ for (int i = 0; i < QR4_K; ++i) {
1601
+
1602
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1603
+ const float d8i = bq8i->d;
1604
+ const int * q8 = (const int *)bq8i->qs + (iqs%4);
1605
+ const int ui1 = q8[0];
1606
+ const int ui2 = q8[4];
1607
+
1608
+ const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
1609
+ const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
1610
+
1611
+ const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1612
+ const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1613
+
1614
+ sumf_d += d8i * (dot1 * sc[i]);
1615
+ sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
1616
+ }
1617
+
1618
+ return d*sumf_d - dmin*sumf_m;
1619
+
1620
+ #else
1621
+
1622
+ uint16_t aux16[2];
1623
+ const uint8_t * s = (const uint8_t *)aux16;
1624
+
1625
+ const uint16_t * a = (const uint16_t *)bq4_K->scales;
1626
+ aux16[0] = a[0] & 0x0f0f;
1627
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
1628
+
1629
+ const float dall = bq4_K->d[0];
1630
+ const float dmin = bq4_K->d[1];
1631
+
1632
+ const float d8_1 = bq8_1[0].d;
1633
+ const float d8_2 = bq8_1[1].d;
1634
+
1635
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1636
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1637
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1638
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
1639
+
1640
+ const int * q4 = (const int *)bq4_K->qs + iqs;
1641
+ const int v1 = q4[0];
1642
+ const int v2 = q4[4];
1643
+
1644
+ const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
1645
+ const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
1646
+ const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1647
+ const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
1648
+
1649
+ sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
1650
+ sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
1651
+
1652
+ return dall * sumf_d - dmin * sumf_m;
1653
+
1654
+ #endif
1655
+
1656
+ #else
1657
+ return 0.0f; // only to satisfy the compiler
1658
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1659
+ }
1660
+
1661
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
1662
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1663
+
1664
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1665
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
1666
+
1667
+ #ifndef GGML_QKK_64
1668
+
1669
+ const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
1670
+ const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1671
+ const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
1672
+
1673
+ float sumf_d = 0.0f;
1674
+ float sumf_m = 0.0f;
1675
+
1676
+ const float d = bq5_K->d;
1677
+ const float dmin = bq5_K->dmin;
1678
+
1679
+ const int vl1 = ql[0];
1680
+ const int vl2 = ql[4];
1681
+
1682
+ const int vh1 = qh[0] >> bq8_offset;
1683
+ const int vh2 = qh[4] >> bq8_offset;
1684
+
1685
+ const uint16_t * scales = (const uint16_t *)bq5_K->scales;
1686
+ uint16_t aux[2];
1687
+ const int j = bq8_offset/2;
1688
+ if (j < 2) {
1689
+ aux[0] = scales[j+0] & 0x3f3f;
1690
+ aux[1] = scales[j+2] & 0x3f3f;
1691
+ } else {
1692
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1693
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
1694
+ }
1695
+ const uint8_t * sc = (const uint8_t *)aux;
1696
+ const uint8_t * m = sc + 2;
1697
+
1698
+ for (int i = 0; i < QR5_K; ++i) {
1699
+
1700
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1701
+ const float d8i = bq8i->d;
1702
+ const int * q8 = (const int *)bq8i->qs + (iqs%4);
1703
+ const int ui1 = q8[0];
1704
+ const int ui2 = q8[4];
1705
+
1706
+ const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
1707
+ const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
1708
+
1709
+ const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
1710
+ const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
1711
+
1712
+ const int vi1 = vil1 | vih1;
1713
+ const int vi2 = vil2 | vih2;
1714
+
1715
+ const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1716
+ const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1717
+
1718
+ sumf_d += d8i * (dot1 * sc[i]);
1719
+ sumf_m += d8i * (dot2 * m[i]);
1720
+
1721
+ }
1722
+
1723
+ return d*sumf_d - dmin*sumf_m;
1724
+
1725
+ #else
1726
+
1727
+ const int8_t * s = bq5_K->scales;
1728
+
1729
+ const float d = bq5_K->d;
1730
+
1731
+ const float d8_1 = bq8_1[0].d;
1732
+ const float d8_2 = bq8_1[1].d;
1733
+
1734
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1735
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1736
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1737
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
1738
+
1739
+ const int * ql = (const int *)bq5_K->qs + iqs;
1740
+ const int vl1 = ql[0];
1741
+ const int vl2 = ql[4];
1742
+
1743
+ const int step = 4 * iqs; // 0, 4, 8, 12
1744
+ const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
1745
+ const int in = step%8; // 0, 4, 0, 4
1746
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
1747
+
1748
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
1749
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
1750
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
1751
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
1752
+
1753
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
1754
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
1755
+
1756
+ return d * sumf_d;
1757
+
1758
+ #endif
1759
+
1760
+ #else
1761
+ return 0.0f; // only to satisfy the compiler
1762
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1763
+ }
1764
+
1765
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
1766
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1767
+
1768
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1769
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
1770
+
1771
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
1772
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
1773
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
1774
+
1775
+ float sumf = 0.0f;
1776
+
1777
+ const float d = bq6_K->d;
1778
+
1779
+ int vl;
1780
+ memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
1781
+
1782
+ int vh;
1783
+ memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
1784
+
1785
+ for (int i = 0; i < QR6_K; ++i) {
1786
+ const int sc = bq6_K->scales[scale_offset + 4*i];
1787
+
1788
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
1789
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
1790
+ const float d8i = bq8i->d;
1791
+
1792
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1793
+
1794
+ const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
1795
+
1796
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1797
+
1798
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1799
+ }
1800
+
1801
+ return d*sumf;
1802
+ #else
1803
+ return 0.0f; // only to satisfy the compiler
1804
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1410
1805
  }
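The q3_K and q6_K dot products above use __vsubss4 to apply their constant offsets lane-wise before the __dp4a (the inverted-mask 4 for q3_K, 32 for q6_K). A host-side reference of __vsubss4, i.e. per-byte signed subtraction with saturation (vsubss4_ref is an illustrative name, not part of the diff):

    #include <cassert>
    #include <cstdint>

    // Reference of CUDA's __vsubss4(a, b): subtract int8 lanes with signed saturation.
    static int vsubss4_ref(int a, int b) {
        uint32_t r = 0;
        for (int k = 0; k < 4; ++k) {
            int d = (int)(int8_t)((uint32_t)a >> (8*k)) - (int)(int8_t)((uint32_t)b >> (8*k));
            if (d > 127)  d = 127;
            if (d < -128) d = -128;
            r |= (uint32_t)(uint8_t)d << (8*k);
        }
        return (int)r;
    }

    int main() {
        // e.g. the q6_K path: (vil | vih) - 32 applied to every byte.
        assert(vsubss4_ref(0x23232323, 0x20202020) == 0x03030303);
        return 0;
    }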
1411
1806
 
1412
1807
  template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -1429,7 +1824,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1429
1824
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1430
1825
  const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
1431
1826
 
1432
- const int iby = i + threadIdx.x / qi; // y block index
1827
+ const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
1433
1828
 
1434
1829
  const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
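The y block index is now scaled by qk/QK8_1: src1 is quantized in QK8_1-sized q8_1 blocks, so for the K-quants (qk = QK_K = 256) each x block spans 8 y blocks and iby has to advance by that factor to stay aligned with ibx. A tiny check of the index math:

    #include <cassert>

    int main() {
        // x block i covers elements [i*qk, (i+1)*qk); the matching q8_1 block
        // therefore starts at element i*qk, i.e. y block index i*qk/QK8_1.
        const int QK8_1 = 32;
        const int qk = 256;                       // QK_K for the K-quants
        for (int i = 0; i < 4; ++i) {
            const int iby = i * (qk / QK8_1);
            assert(iby * QK8_1 == i * qk);        // element offsets line up
        }
        return 0;
    }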
1435
1830
 
@@ -1515,11 +1910,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1515
1910
  }
1516
1911
  }
1517
1912
 
1518
- static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1913
+ static __global__ void mul_mat_p021_f16_f32(
1914
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
1915
+ const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
1916
+
1519
1917
  const half * x = (const half *) vx;
1520
1918
 
1521
1919
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1522
1920
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
1921
+ const int channel_x = channel / (nchannels_y / nchannels_x);
1523
1922
 
1524
1923
  const int nrows_y = ncols_x;
1525
1924
  const int nrows_dst = nrows_x;
@@ -1535,7 +1934,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
1535
1934
  }
1536
1935
 
1537
1936
  // x is transposed and permuted
1538
- const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
1937
+ const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
1539
1938
  const float xi = __half2float(x[ix]);
1540
1939
 
1541
1940
  const int row_y = col_x;
@@ -1563,12 +1962,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
1563
1962
 
1564
1963
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1565
1964
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1566
- const int row_stride_x, const int channel_stride_x) {
1965
+ const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
1567
1966
 
1568
1967
  const half * x = (const half *) vx;
1569
1968
 
1570
1969
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1571
1970
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
1971
+ const int channel_x = channel / channel_x_divisor;
1572
1972
 
1573
1973
  const int nrows_y = ncols_x;
1574
1974
  const int nrows_dst = nrows_x;
@@ -1585,7 +1985,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1585
1985
  break;
1586
1986
  }
1587
1987
 
1588
- const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
1988
+ const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
1589
1989
  const float xi = __half2float(x[ix]);
1590
1990
 
1591
1991
  const int row_y = col_x;
@@ -1667,6 +2067,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
1667
2067
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
1668
2068
  }
1669
2069
 
2070
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
2071
+ const int col = blockDim.x*blockIdx.x + threadIdx.x;
2072
+ const int half_n_dims = ncols/4;
2073
+
2074
+ if (col >= half_n_dims) {
2075
+ return;
2076
+ }
2077
+
2078
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
2079
+ const int i = row*ncols + col;
2080
+
2081
+ const float col_theta_scale = powf(theta_scale, col);
2082
+
2083
+ const float theta = p*col_theta_scale;
2084
+ const float sin_theta = sinf(theta);
2085
+ const float cos_theta = cosf(theta);
2086
+
2087
+ const float x0 = x[i + 0];
2088
+ const float x1 = x[i + half_n_dims];
2089
+
2090
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
2091
+ dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
2092
+
2093
+ const float block_theta = block_p*col_theta_scale;
2094
+ const float sin_block_theta = sinf(block_theta);
2095
+ const float cos_block_theta = cosf(block_theta);
2096
+
2097
+ const float x2 = x[i + half_n_dims * 2];
2098
+ const float x3 = x[i + half_n_dims * 3];
2099
+
2100
+ dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
2101
+ dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
2102
+ }
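rope_glm_f32 applies two rotations per row: columns [0, ncols/4) are paired with [ncols/4, ncols/2) and rotated by theta = p*theta_scale^col, while the second half of the row is rotated by block_theta = block_p*theta_scale^col (the host splits the position into p and block_p further down). A host-side reference of the 2x2 rotation each pair receives (rotate_pair is an illustrative helper):

    #include <cmath>

    // One column pair (x0, x1) rotated by angle theta, as in rope_glm_f32:
    // dst0 = x0*cos - x1*sin, dst1 = x0*sin + x1*cos.
    static void rotate_pair(float & x0, float & x1, float theta) {
        const float c = std::cos(theta), s = std::sin(theta);
        const float r0 = x0*c - x1*s;
        const float r1 = x0*s + x1*c;
        x0 = r0;
        x1 = r1;
    }

    int main() {
        float x0 = 1.0f, x1 = 0.0f;
        rotate_pair(x0, x1, 3.14159265f / 2);   // quarter turn: (1,0) -> (~0, ~1)
        return x1 > 0.99f ? 0 : 1;
    }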
2103
+
1670
2104
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
1671
2105
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
1672
2106
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1732,9 +2166,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
1732
2166
  dst[i] = scale * x[i];
1733
2167
  }
1734
2168
 
1735
- static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
1736
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
1737
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
2169
+ static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
2170
+ const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
2171
+ add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
1738
2172
  }
1739
2173
 
1740
2174
  static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1763,10 +2197,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
1763
2197
  norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
1764
2198
  }
1765
2199
 
1766
- static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2200
+ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
1767
2201
  GGML_ASSERT(ncols % WARP_SIZE == 0);
1768
2202
  const dim3 block_dims(WARP_SIZE, 1, 1);
1769
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
2203
+ rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
1770
2204
  }
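rms_norm_f32 and this wrapper now take eps as a parameter instead of hard-coding 1e-6f; ggml_cuda_op_rms_norm further down reads it from dst->op_params. A host-side reference of the per-row computation with a configurable eps (rms_norm_ref is an illustrative name):

    #include <cassert>
    #include <cmath>

    // RMS norm of one row: dst[i] = x[i] / sqrt(mean(x^2) + eps).
    static void rms_norm_ref(const float * x, float * dst, int ncols, float eps) {
        float sum = 0.0f;
        for (int i = 0; i < ncols; ++i) {
            sum += x[i] * x[i];
        }
        const float scale = 1.0f / std::sqrt(sum / ncols + eps);
        for (int i = 0; i < ncols; ++i) {
            dst[i] = x[i] * scale;
        }
    }

    int main() {
        const float x[2] = {3.0f, 4.0f};         // mean(x^2) = 12.5
        float dst[2];
        rms_norm_ref(x, dst, 2, 0.0f);
        assert(std::fabs(dst[0] - 3.0f / std::sqrt(12.5f)) < 1e-6f);
        return 0;
    }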
1771
2205
 
1772
2206
  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -1928,7 +2362,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
1928
2362
  }
1929
2363
 
1930
2364
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1931
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2365
+ GGML_ASSERT(ncols % QK4_0 == 0);
1932
2366
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1933
2367
  const dim3 block_nums(1, block_num_y, 1);
1934
2368
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1937,7 +2371,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
1937
2371
  }
1938
2372
 
1939
2373
  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1940
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2374
+ GGML_ASSERT(ncols % QK4_1 == 0);
1941
2375
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1942
2376
  const dim3 block_nums(1, block_num_y, 1);
1943
2377
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1946,7 +2380,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
1946
2380
  }
1947
2381
 
1948
2382
  static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1949
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2383
+ GGML_ASSERT(ncols % QK5_0 == 0);
1950
2384
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1951
2385
  const dim3 block_nums(1, block_num_y, 1);
1952
2386
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1955,7 +2389,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
1955
2389
  }
1956
2390
 
1957
2391
  static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1958
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2392
+ GGML_ASSERT(ncols % QK5_1 == 0);
1959
2393
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1960
2394
  const dim3 block_nums(1, block_num_y, 1);
1961
2395
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1964,7 +2398,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
1964
2398
  }
1965
2399
 
1966
2400
  static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1967
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
2401
+ GGML_ASSERT(ncols % QK8_0 == 0);
1968
2402
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
1969
2403
  const dim3 block_nums(1, block_num_y, 1);
1970
2404
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1972,6 +2406,57 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
1972
2406
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1973
2407
  }
1974
2408
 
2409
+ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2410
+ GGML_ASSERT(ncols % QK_K == 0);
2411
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2412
+ const dim3 block_nums(1, block_num_y, 1);
2413
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2414
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
2415
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2416
+ }
2417
+
2418
+ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2419
+ GGML_ASSERT(ncols % QK_K == 0);
2420
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2421
+ const dim3 block_nums(1, block_num_y, 1);
2422
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2423
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
2424
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2425
+ }
2426
+
2427
+ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2428
+ GGML_ASSERT(ncols % QK_K == 0);
2429
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2430
+ const dim3 block_nums(1, block_num_y, 1);
2431
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2432
+ // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
2433
+ // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
2434
+ // is better amortized.
2435
+ mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
2436
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2437
+ }
2438
+
2439
+ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2440
+ GGML_ASSERT(ncols % QK_K == 0);
2441
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2442
+ const dim3 block_nums(1, block_num_y, 1);
2443
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2444
+ // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
2445
+ // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
2446
+ // is better amortized.
2447
+ mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
2448
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2449
+ }
2450
+
2451
+ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2452
+ GGML_ASSERT(ncols % QK_K == 0);
2453
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2454
+ const dim3 block_nums(1, block_num_y, 1);
2455
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2456
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
2457
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2458
+ }
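In the mul_mat_vec_q template, the qi argument is how many threads share one x block, so a warp covers WARP_SIZE/qi blocks per iteration; passing QI4_K/2 and QI5_K/2 (per the notes above) makes each thread cover twice as many int-sized quant groups, amortizing the scale unpacking. The arithmetic, spelled out as a sketch:

    #include <cassert>

    int main() {
        const int WARP_SIZE = 32;
        const int QK_K = 256, QR4_K = 2;
        const int QI4_K = QK_K / (4 * QR4_K);            // 32 int-sized groups per block
        assert(WARP_SIZE / QI4_K == 1);                  // qi = QI4_K   -> 1 block per warp
        assert(WARP_SIZE / (QI4_K / 2) == 2);            // qi = QI4_K/2 -> 2 blocks per warp
        return 0;
    }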
2459
+
1975
2460
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1976
2461
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
1977
2462
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2015,20 +2500,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
2015
2500
  }
2016
2501
  }
2017
2502
 
2018
- static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
2019
- const dim3 block_nums(1, nrows_x, nchannels_x);
2503
+ static void ggml_mul_mat_p021_f16_f32_cuda(
2504
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
2505
+ const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
2506
+
2507
+ const dim3 block_nums(1, nrows_x, nchannels_y);
2020
2508
  const dim3 block_dims(WARP_SIZE, 1, 1);
2021
- mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
2509
+ mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
2022
2510
  }
2023
2511
 
2024
2512
  static void ggml_mul_mat_vec_nc_f16_f32_cuda(
2025
2513
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
2026
- const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
2514
+ const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
2027
2515
 
2028
- const dim3 block_nums(1, nrows_x, nchannels_x);
2516
+ const dim3 block_nums(1, nrows_x, nchannels_y);
2029
2517
  const dim3 block_dims(WARP_SIZE, 1, 1);
2030
2518
  mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
2031
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
2519
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
2032
2520
  }
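Both f16 mul-mat paths now launch one block per src1 channel (nchannels_y) and map it back to a src0 channel with channel / (nchannels_y/nchannels_x), so one src0 channel can be shared by several src1 channels when src1 has more channels than src0. A sketch of that mapping, assuming nchannels_y is a multiple of nchannels_x as the kernels do:

    #include <cassert>

    int main() {
        const int nchannels_x = 8, nchannels_y = 32;
        const int divisor = nchannels_y / nchannels_x;   // 4 y channels per x channel
        for (int channel = 0; channel < nchannels_y; ++channel) {
            const int channel_x = channel / divisor;     // y 0..3 -> x 0, y 4..7 -> x 1, ...
            assert(channel_x < nchannels_x);
        }
        return 0;
    }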
2033
2521
 
2034
2522
  static void ggml_cpy_f32_f32_cuda(
@@ -2064,6 +2552,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
2064
2552
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
2065
2553
  }
2066
2554
 
2555
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
2556
+ GGML_ASSERT(nrows % 4 == 0);
2557
+ const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
2558
+ const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
2559
+ const dim3 block_nums(num_blocks_x, nrows, 1);
2560
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
2561
+ }
2562
+
2067
2563
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
2068
2564
  const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
2069
2565
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2106,20 +2602,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
2106
2602
  scoped_spin_lock lock(g_cuda_pool_lock);
2107
2603
  int id;
2108
2604
  CUDA_CHECK(cudaGetDevice(&id));
2109
-
2605
+ #ifdef DEBUG_CUDA_MALLOC
2606
+ int nnz = 0;
2607
+ size_t max_size = 0, tot_size = 0;
2608
+ #endif
2609
+ size_t best_diff = 1ull << 36;
2610
+ int ibest = -1;
2110
2611
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
2111
2612
  cuda_buffer& b = g_cuda_buffer_pool[id][i];
2112
- if (b.size >= size && b.ptr != nullptr) {
2113
- void * ptr = b.ptr;
2114
- *actual_size = b.size;
2115
- b.ptr = nullptr;
2116
- b.size = 0;
2117
- return ptr;
2613
+ if (b.ptr != nullptr) {
2614
+ #ifdef DEBUG_CUDA_MALLOC
2615
+ ++nnz;
2616
+ tot_size += b.size;
2617
+ if (b.size > max_size) max_size = b.size;
2618
+ #endif
2619
+ if (b.size >= size) {
2620
+ size_t diff = b.size - size;
2621
+ if (diff < best_diff) {
2622
+ best_diff = diff;
2623
+ ibest = i;
2624
+ if (!best_diff) {
2625
+ void * ptr = b.ptr;
2626
+ *actual_size = b.size;
2627
+ b.ptr = nullptr;
2628
+ b.size = 0;
2629
+ return ptr;
2630
+ }
2631
+ }
2632
+ }
2118
2633
  }
2119
2634
  }
2635
+ if (ibest >= 0) {
2636
+ cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
2637
+ void * ptr = b.ptr;
2638
+ *actual_size = b.size;
2639
+ b.ptr = nullptr;
2640
+ b.size = 0;
2641
+ return ptr;
2642
+ }
2643
+ #ifdef DEBUG_CUDA_MALLOC
2644
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
2645
+ (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
2646
+ #endif
2120
2647
  void * ptr;
2121
- CUDA_CHECK(cudaMalloc((void **) &ptr, size));
2122
- *actual_size = size;
2648
+ size_t look_ahead_size = (size_t) (1.05 * size);
2649
+ look_ahead_size = 256 * ((look_ahead_size + 255)/256);
2650
+ CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
2651
+ *actual_size = look_ahead_size;
2123
2652
  return ptr;
2124
2653
  }
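The pool now does a best-fit pass over the cached buffers (returning immediately on an exact size match) instead of taking the first buffer that is large enough, and a miss over-allocates by about 5%, rounded up to a 256-byte multiple, so slightly larger follow-up requests can reuse the buffer. A sketch of the rounding on the miss path (look_ahead is an illustrative helper):

    #include <cassert>
    #include <cstddef>

    // Same look-ahead rule as the cudaMalloc fallback above.
    static size_t look_ahead(size_t size) {
        size_t s = (size_t)(1.05 * size);       // ~5% head-room
        return 256 * ((s + 255) / 256);         // round up to a 256-byte multiple
    }

    int main() {
        assert(look_ahead(1000) == 1280);       // 1050 -> 1280
        assert(look_ahead(4096) == 4352);       // 4300 -> 4352
        return 0;
    }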
2125
2654
 
@@ -2147,7 +2676,9 @@ static size_t g_scratch_offset = 0;
2147
2676
 
2148
2677
  static int g_device_count = -1;
2149
2678
  static int g_main_device = 0;
2679
+ #ifndef GGML_CUDA_FORCE_DMMV
2150
2680
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
2681
+ #endif
2151
2682
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
2152
2683
 
2153
2684
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2170,7 +2701,9 @@ void ggml_init_cublas() {
2170
2701
  g_tensor_split[id] = total_vram;
2171
2702
  total_vram += prop.totalGlobalMem;
2172
2703
 
2704
+ #ifndef GGML_CUDA_FORCE_DMMV
2173
2705
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
2706
+ #endif
2174
2707
  }
2175
2708
  for (int id = 0; id < g_device_count; ++id) {
2176
2709
  g_tensor_split[id] /= total_vram;
@@ -2195,6 +2728,9 @@ void ggml_init_cublas() {
2195
2728
  }
2196
2729
 
2197
2730
  void ggml_cuda_set_tensor_split(const float * tensor_split) {
2731
+ if (tensor_split == nullptr) {
2732
+ return;
2733
+ }
2198
2734
  bool all_zero = true;
2199
2735
  for (int i = 0; i < g_device_count; ++i) {
2200
2736
  if (tensor_split[i] != 0.0f) {
@@ -2293,17 +2829,15 @@ inline void ggml_cuda_op_add(
2293
2829
  GGML_ASSERT(src1_ddf_i != nullptr);
2294
2830
  GGML_ASSERT(dst_ddf_i != nullptr);
2295
2831
 
2296
- // TODO: support broadcasting
2297
- GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
2298
-
2299
2832
  const int64_t ne00 = src0->ne[0];
2300
2833
  const int64_t i01_diff = i01_high - i01_low;
2301
2834
 
2302
- // const int64_t ne10 = src1->ne[0];
2835
+ const int64_t ne10 = src1->ne[0];
2836
+ const int64_t ne11 = src1->ne[1];
2303
2837
 
2304
2838
  // compute
2305
2839
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
2306
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
2840
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
2307
2841
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
2308
2842
  add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
2309
2843
  } else {
@@ -2327,23 +2861,17 @@ inline void ggml_cuda_op_mul(
2327
2861
  GGML_ASSERT(dst_ddf_i != nullptr);
2328
2862
 
2329
2863
  const int64_t ne00 = src0->ne[0];
2864
+ const int64_t i01_diff = i01_high - i01_low;
2865
+
2330
2866
  const int64_t ne10 = src1->ne[0];
2331
2867
  const int64_t ne11 = src1->ne[1];
2332
2868
 
2333
- for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
2334
- const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
2335
-
2336
- float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
2337
- float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
2338
- float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
2339
-
2340
- // compute
2341
- mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
2342
- }
2869
+ mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
2343
2870
 
2344
2871
  (void) dst;
2345
2872
  (void) src0_ddq_i;
2346
2873
  (void) i02;
2874
+ (void) i1;
2347
2875
  }
2348
2876
 
2349
2877
  inline void ggml_cuda_op_gelu(
@@ -2423,8 +2951,11 @@ inline void ggml_cuda_op_rms_norm(
2423
2951
  const int64_t ne00 = src0->ne[0];
2424
2952
  const int64_t i01_diff = i01_high - i01_low;
2425
2953
 
2954
+ float eps;
2955
+ memcpy(&eps, dst->op_params, sizeof(float));
2956
+
2426
2957
  // compute
2427
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
2958
+ rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
2428
2959
 
2429
2960
  (void) src1;
2430
2961
  (void) dst;
@@ -2452,18 +2983,27 @@ inline void ggml_cuda_op_mul_mat_vec(
2452
2983
  int id;
2453
2984
  CUDA_CHECK(cudaGetDevice(&id));
2454
2985
 
2455
- const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
2986
+ bool mul_mat_vec_q_implemented =
2987
+ src0->type == GGML_TYPE_Q4_0 ||
2456
2988
  src0->type == GGML_TYPE_Q4_1 ||
2457
2989
  src0->type == GGML_TYPE_Q5_0 ||
2458
2990
  src0->type == GGML_TYPE_Q5_1 ||
2459
2991
  src0->type == GGML_TYPE_Q8_0;
2460
-
2461
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
2992
+ #if QK_K == 256
2993
+ mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
2994
+ src0->type == GGML_TYPE_Q2_K ||
2995
+ src0->type == GGML_TYPE_Q3_K ||
2996
+ src0->type == GGML_TYPE_Q4_K ||
2997
+ src0->type == GGML_TYPE_Q5_K ||
2998
+ src0->type == GGML_TYPE_Q6_K;
2999
+ #endif // QK_K == 256
3000
+
3001
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
2462
3002
  #endif
2463
3003
 
2464
3004
  if (use_mul_mat_vec_q) {
2465
- int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
2466
- padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
3005
+ const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
3006
+ ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
2467
3007
  size_t as;
2468
3008
  void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
2469
3009
  quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
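padded_row_size is now ne00 rounded up to the next multiple of MATRIX_ROW_PADDING (raised to 512 earlier in this diff) and left untouched when already aligned; the temporary q8_1 buffer is sized from the padded length so the vec-dot kernels can read whole int-sized groups past the end of a row without going out of bounds. The rounding rule, as a sketch:

    #include <cassert>
    #include <cstdint>

    int main() {
        const int64_t MATRIX_ROW_PADDING = 512;
        auto pad = [&](int64_t ne00) {
            return ne00 % MATRIX_ROW_PADDING == 0 ?
                ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
        };
        assert(pad(4096) == 4096);   // already a multiple of 512: unchanged
        assert(pad(4097) == 4608);   // rounded up to the next 512 boundary
        return 0;
    }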
@@ -2484,6 +3024,21 @@ inline void ggml_cuda_op_mul_mat_vec(
2484
3024
  case GGML_TYPE_Q8_0:
2485
3025
  mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2486
3026
  break;
3027
+ case GGML_TYPE_Q2_K:
3028
+ mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
3029
+ break;
3030
+ case GGML_TYPE_Q3_K:
3031
+ mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
3032
+ break;
3033
+ case GGML_TYPE_Q4_K:
3034
+ mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
3035
+ break;
3036
+ case GGML_TYPE_Q5_K:
3037
+ mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
3038
+ break;
3039
+ case GGML_TYPE_Q6_K:
3040
+ mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
3041
+ break;
2487
3042
  default:
2488
3043
  GGML_ASSERT(false);
2489
3044
  break;
@@ -2615,17 +3170,31 @@ inline void ggml_cuda_op_rope(
2615
3170
  const int64_t ne00 = src0->ne[0];
2616
3171
  const int64_t i01_diff = i01_high - i01_low;
2617
3172
 
2618
- const int n_past = ((int32_t *) src1->data)[0];
2619
- const int n_dims = ((int32_t *) src1->data)[1];
2620
- const int mode = ((int32_t *) src1->data)[2];
2621
- GGML_ASSERT(mode == 0);
3173
+ const int n_past = ((int32_t *) dst->op_params)[0];
3174
+ const int n_dims = ((int32_t *) dst->op_params)[1];
3175
+ const int mode = ((int32_t *) dst->op_params)[2];
3176
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
3177
+ // RoPE alteration for extended context
2622
3178
 
2623
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
2624
- const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
3179
+ float freq_base, freq_scale;
3180
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
3181
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
3182
+
3183
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
3184
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3185
+
3186
+ bool is_glm = mode & 4;
2625
3187
 
2626
3188
  // compute
2627
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
3189
+ if (is_glm) {
3190
+ const float id_p = min(p, n_ctx - 2.f);
3191
+ const float block_p = max(p - (n_ctx - 2.f), 0.f);
3192
+ rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
3193
+ } else {
3194
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
3195
+ }
2628
3196
 
3197
+ (void) src1;
2629
3198
  (void) dst;
2630
3199
  (void) src0_ddq_i;
2631
3200
  (void) src1_ddf_i;
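RoPE parameters now come from dst->op_params rather than src1->data: four int32 values (n_past, n_dims, mode, n_ctx) followed by freq_base and freq_scale as floats, with mode & 4 selecting the GLM path above. A host-side sketch of the resulting per-pair angle for the (mode & 1) == 0 case, using the theta_scale and p formulas from this hunk (the per-pair exponent follows the usual RoPE layout; rope_theta is an illustrative helper, the kernel itself is not shown here):

    #include <cmath>

    // theta for rotary pair `pair` of row i02, with the new base/scale knobs.
    static float rope_theta(float freq_base, float freq_scale,
                            int n_past, int i02, int n_dims, int pair) {
        const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
        const float p = (n_past + i02) * freq_scale;
        return p * std::pow(theta_scale, (float)pair);
    }

    int main() {
        // With freq_base = 10000 and freq_scale = 1 this reduces to the previous
        // hard-coded behaviour, where theta_scale was powf(10000.0, -2.0f/n_dims).
        const float t = rope_theta(10000.0f, 1.0f, /*n_past=*/5, /*i02=*/0, /*n_dims=*/128, /*pair=*/0);
        return t == 5.0f ? 0 : 1;
    }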
@@ -2644,11 +3213,12 @@ inline void ggml_cuda_op_diag_mask_inf(
2644
3213
  const int64_t ne01 = src0->ne[1];
2645
3214
  const int64_t i01_diff = i01_high - i01_low;
2646
3215
 
2647
- const int n_past = ((int32_t *) src1->data)[0];
3216
+ const int n_past = ((int32_t *) dst->op_params)[0];
2648
3217
 
2649
3218
  // compute
2650
3219
  diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
2651
3220
 
3221
+ (void) src1;
2652
3222
  (void) dst;
2653
3223
  (void) src0_ddq_i;
2654
3224
  (void) src1_ddf_i;
@@ -2716,6 +3286,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2716
3286
  const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
2717
3287
  const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
2718
3288
  const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
3289
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
3290
+
3291
+ GGML_ASSERT(ne03 == ne13);
2719
3292
 
2720
3293
  const int64_t ne0 = dst->ne[0];
2721
3294
  const int64_t ne1 = dst->ne[1];
@@ -2727,12 +3300,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);

  // strides for iteration over dims 3 and 2
- const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
- const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
+ const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+ const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+ const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
  const int64_t src0_stride = ne00 * ne01 * stride_mod;
  const int64_t src1_stride = ne10 * ne11 * stride_mod;
  const int64_t dst_stride = ne0 * ne1 * stride_mod;

+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
+ const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+ const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+ GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
  const size_t src0_ts = ggml_type_size(src0->type);
  const size_t src0_bs = ggml_blck_size(src0->type);

@@ -2749,6 +3329,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);

  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+ GGML_ASSERT(!(split && ne02 < ne12));

  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);

@@ -2785,7 +3366,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
  } else {
  row_low = 0;
- row_high = nrows0;
+ row_high = nrows0*i02_divisor;
  }
  if (row_low == row_high) {
  continue;
@@ -2833,16 +3414,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
  }

- const int64_t i03_max = flatten_rows ? 1 : ne03;
- const int64_t i02_max = flatten_rows ? 1 : ne02;
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
-
  for (int64_t i03 = 0; i03 < i03_max; i03++) {
  const int64_t i13 = i03 % ne13;
  for (int64_t i02 = 0; i02 < i02_max; i02++) {
  const int64_t i12 = i02 % ne12;

- const int64_t i0 = i03*ne02 + i02;
+ const int64_t i0 = i03*i02_max + i02;

  // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
  const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -2876,10 +3453,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  const int64_t i11 = i13*ne12 + i12;

  // for split tensors the data begins at i0 == i0_offset_low
- char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
- float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+ char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+ float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

  // for split tensors the data pointer needs to be rounded down
  // to the bin edge for i03, i02 bins beyond the first
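The i02_divisor bookkeeping introduced above lets ggml_cuda_op broadcast src0 across the second dimension of src1 when ne12 > ne02: several consecutive i02 iterations reuse the same src0 slice, which is why the src0 pointers are offset by i0/i02_divisor while dst still advances by i0. A rough sketch of just that index mapping (not the multi-GPU split logic), assuming ne12 is an integer multiple of ne02 as the i02_divisor computation implies:

    #include <cassert>
    #include <cstdint>

    // Which src0 channel serves destination channel i02 (mirrors i02/i02_divisor above).
    static int64_t src0_channel_for(int64_t i02, int64_t ne02, int64_t ne12) {
        assert(ne02 >= ne12 || ne12 % ne02 == 0); // assumed divisibility for broadcasting
        const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
        return i02 / i02_divisor;
    }

    // Example: ne02 = 8 src0 channels, ne12 = 32 src1 channels -> i02_divisor = 4,
    // so i02 = 0..3 read src0 channel 0, i02 = 4..7 read channel 1, and so on.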
@@ -2918,11 +3495,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  }
  }

- if (!src0_on_device || !src0_is_contiguous) {
+ if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
  if (src0_is_f32) {
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
  } else {
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
  }
  }

@@ -3076,6 +3653,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

+ const int64_t ne12 = src1->ne[2];
+
  CUDA_CHECK(cudaSetDevice(g_main_device));
  cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

@@ -3088,7 +3667,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
  }

  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3102,6 +3681,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

+ const int64_t ne12 = src1->ne[2];
+
  const int64_t nb01 = src0->nb[1];
  const int64_t nb02 = src0->nb[2];

@@ -3120,7 +3701,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int row_stride_x = nb01 / sizeof(half);
  const int channel_stride_x = nb02 / sizeof(half);

- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
  }

  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3197,6 +3778,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  (void) dst;
  }

+ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_cpy(src0, dst, nullptr);
+ (void) src1;
+ }
+
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3256,7 +3842,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  size_t size = ggml_nbytes_split(tensor, nrows_split);
  const size_t original_size = size;

- // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
  if (ne0 % MATRIX_ROW_PADDING != 0) {
  size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
  * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
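The updated comment reflects the row padding growing from a multiple of 256 to a multiple of 512 elements. A small worked example of the size arithmetic in this hunk, assuming MATRIX_ROW_PADDING is 512 (implied by the comment) and a q4_0 tensor (32 elements per block, 18 bytes per block); only the device allocation is padded, while the copy in the next hunk transfers just original_size bytes:

    #include <cstddef>
    #include <cstdio>

    // Illustrative only: repeats the padding arithmetic from the hunk above for q4_0.
    int main() {
        const size_t MATRIX_ROW_PADDING = 512; // assumed value, per the updated comment
        const size_t ne0        = 11008;       // example row length, not a multiple of 512
        const size_t type_size  = 18;          // q4_0 block: fp16 delta + 16 bytes of nibbles
        const size_t block_size = 32;          // elements per q4_0 block

        size_t pad_elems = 0;
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            pad_elems = MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING; // 512 - 256 = 256
        }
        const size_t pad_bytes = pad_elems * type_size / block_size;   // 256 * 18 / 32 = 144

        printf("padding: %zu elements -> %zu extra bytes\n", pad_elems, pad_bytes);
        return 0;
    }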
@@ -3272,7 +3858,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  }


- CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

  extra->data_device[id] = buf;

@@ -3306,6 +3892,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
  delete extra;
  }

+ static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+ static size_t g_temp_tensor_extra_index = 0;
+
+ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (g_temp_tensor_extras == nullptr) {
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = g_temp_tensor_extra_index;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+ }
+
  void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
  if (scratch && g_scratch_size == 0) {
  return;
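ggml_cuda_alloc_temp_tensor_extra, added above, hands out ggml_tensor_extra_gpu slots from a fixed ring of GGML_MAX_NODES entries instead of allocating a fresh extra with new for every view, copy, or scratch tensor, so extras produced on each graph evaluation are recycled rather than leaked. A stripped-down sketch of the same ring-buffer pattern with placeholder names (not the ggml types):

    #include <cstddef>
    #include <cstring>

    struct extra_slot { void * data_device[16]; }; // placeholder for ggml_tensor_extra_gpu

    static const size_t MAX_SLOTS = 4096;    // stand-in for GGML_MAX_NODES
    static extra_slot * g_slots   = nullptr; // lazily allocated pool
    static size_t       g_slot_ix = 0;       // next slot to hand out

    static extra_slot * alloc_temp_slot() {
        if (g_slots == nullptr) {
            g_slots = new extra_slot[MAX_SLOTS];
        }
        extra_slot * slot = &g_slots[g_slot_ix];
        g_slot_ix = (g_slot_ix + 1) % MAX_SLOTS; // wrap; slots are reused on later evaluations
        memset(slot, 0, sizeof(*slot));
        return slot;
    }

Dedicated (non-scratch, non-view) tensors still receive a heap-allocated extra further down in ggml_cuda_assign_buffers_impl.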
@@ -3314,7 +3916,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  // recursively assign CUDA buffers until a compute tensor is found
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
  ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
  }
  }
@@ -3323,8 +3925,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  }

  tensor->backend = GGML_BACKEND_GPU;
- struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
+ struct ggml_tensor_extra_gpu * extra;

  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW ||
@@ -3337,12 +3938,14 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
+ memcpy(&offset, tensor->op_params, sizeof(size_t));
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src0_ddc + offset;
  } else if (tensor->op == GGML_OP_CPY) {
  struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
  void * src1_ddv = src1_extra->data_device[g_main_device];
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src1_ddv;
  } else if (scratch) {
  GGML_ASSERT(size <= g_scratch_size);
@@ -3355,6 +3958,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
  g_scratch_buffer = data;
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = data + g_scratch_offset;

  g_scratch_offset += size;
@@ -3364,6 +3968,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  void * data;
  CUDA_CHECK(cudaMalloc(&data, size));
  CUDA_CHECK(cudaMemset(data, 0, size));
+ extra = new ggml_tensor_extra_gpu;
+ memset(extra, 0, sizeof(*extra));
  extra->data_device[g_main_device] = data;
  }

@@ -3416,30 +4022,41 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

  switch (tensor->op) {
- case GGML_OP_ADD:
+ case GGML_OP_DUP:
  if (!any_on_device) {
  return false;
  }
- func = ggml_cuda_add;
+ func = ggml_cuda_dup;
  break;
- case GGML_OP_MUL:
- if (!any_on_device) {
- return false;
- }
- func = ggml_cuda_mul;
- break;
- case GGML_OP_GELU:
+ case GGML_OP_ADD:
  if (!any_on_device) {
  return false;
  }
- func = ggml_cuda_gelu;
+ func = ggml_cuda_add;
  break;
- case GGML_OP_SILU:
+ case GGML_OP_MUL:
  if (!any_on_device) {
  return false;
  }
- func = ggml_cuda_silu;
+ func = ggml_cuda_mul;
  break;
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(tensor)) {
+ case GGML_UNARY_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_gelu;
+ break;
+ case GGML_UNARY_OP_SILU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_silu;
+ break;
+ default:
+ return false;
+ } break;
  case GGML_OP_NORM:
  if (!any_on_device) {
  return false;
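GELU and SILU are no longer dispatched as top-level ops here; they arrive as GGML_OP_UNARY and are distinguished via ggml_get_unary_op, hence the nested switch added above. A minimal sketch of that dispatch shape with simplified stand-in types (only the names that appear in the hunk are real ggml identifiers):

    // Simplified mirror of the nested unary dispatch; not the ggml function signatures.
    typedef void (*cuda_func_t)(const void * src0, const void * src1, void * dst);

    enum unary_op { UNARY_GELU, UNARY_SILU, UNARY_OTHER };

    static void cuda_gelu(const void *, const void *, void *) {} // stand-in kernels
    static void cuda_silu(const void *, const void *, void *) {}

    static bool pick_unary_func(unary_op op, bool any_on_device, cuda_func_t * func) {
        switch (op) {
            case UNARY_GELU:
                if (!any_on_device) return false;
                *func = cuda_gelu;
                return true;
            case UNARY_SILU:
                if (!any_on_device) return false;
                *func = cuda_silu;
                return true;
            default:
                return false; // unary ops without a CUDA kernel fall back to the CPU
        }
    }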
@@ -3470,6 +4087,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_cpy;
  break;
+ case GGML_OP_CONT:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE: