llama_cpp 0.3.3 → 0.3.5

@@ -13,6 +13,8 @@
  #include "ggml-cuda.h"
  #include "ggml.h"

+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
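For context, a minimal sketch (illustration only, not part of this diff) of what the `__dp4a` intrinsic gated by `MIN_CC_DP4A` computes: it treats two 32-bit integers as four packed signed 8-bit lanes and accumulates their byte-wise dot product into a third operand, which is what the new vec_dot_*_q8_1 functions below rely on.

// Illustration only, assuming a device of compute capability >= 6.1 (sm_61):
// __dp4a(a, b, c) == c + a0*b0 + a1*b1 + a2*b2 + a3*b3, where a0..a3 and b0..b3
// are the signed 8-bit lanes of a and b.
__global__ void dp4a_demo(const int * a, const int * b, int * out, const int n) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= n) {
        return;
    }
#if __CUDA_ARCH__ >= 610
    out[i] = __dp4a(a[i], b[i], 0); // 4-way byte dot product in one instruction
#else
    // scalar reference for older GPUs
    int sum = 0;
    for (int k = 0; k < 4; ++k) {
        const int8_t ak = (int8_t)(a[i] >> (8*k));
        const int8_t bk = (int8_t)(b[i] >> (8*k));
        sum += (int)ak * (int)bk;
    }
    out[i] = sum;
#endif
}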
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(

  #define QK4_0 32
  #define QR4_0 2
- #define QI4_0 4
+ #define QI4_0 (QK4_0 / (4 * QR4_0))
  typedef struct {
  half d; // delta
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

  #define QK4_1 32
  #define QR4_1 2
- #define QI4_1 4
+ #define QI4_1 (QK4_1 / (4 * QR4_1))
  typedef struct {
  half d; // delta
  half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

  #define QK5_0 32
  #define QR5_0 2
- #define QI5_0 4
+ #define QI5_0 (QK5_0 / (4 * QR5_0))
  typedef struct {
  half d; // delta
  uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

  #define QK5_1 32
  #define QR5_1 2
- #define QI5_1 4
+ #define QI5_1 (QK5_1 / (4 * QR5_1))
  typedef struct {
  half d; // delta
  half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

  #define QK8_0 32
  #define QR8_0 1
- #define QI8_0 8
+ #define QI8_0 (QK8_0 / (4 * QR8_0))
  typedef struct {
  half d; // delta
  int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

  #define QK8_1 32
  #define QR8_1 1
- #define QI8_1 8
+ #define QI8_1 (QK8_1 / (4 * QR8_1))
  typedef struct {
  half d; // delta
  half s; // unquantized sum
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
  #define K_SCALE_SIZE 12
  #endif

+ #define QR2_K 4
+ #define QI2_K (QK_K / (4*QR2_K))
  typedef struct {
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
  uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
  } block_q2_K;
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

+ #define QR3_K 4
+ #define QI3_K (QK_K / (4*QR3_K))
  typedef struct {
  uint8_t hmask[QK_K/8]; // quants - high bit
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
  } block_q3_K;
  //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");

+ #define QR4_K 2
+ #define QI4_K (QK_K / (4*QR4_K))
  #ifdef GGML_QKK_64
  typedef struct {
  half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
  #endif

+ #define QR5_K 2
+ #define QI5_K (QK_K / (4*QR5_K))
  #ifdef GGML_QKK_64
  typedef struct {
  half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
  #endif

+ #define QR6_K 2
+ #define QI6_K (QK_K / (4*QR6_K))
  typedef struct {
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
  uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -208,7 +220,7 @@ typedef struct {
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

  #define WARP_SIZE 32
- #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+ #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

  #define CUDA_ADD_BLOCK_SIZE 256
  #define CUDA_MUL_BLOCK_SIZE 256
@@ -240,13 +252,13 @@ struct ggml_tensor_extra_gpu {
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
  };

- static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+ static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

- if (i >= k) {
+ if (i >= kx) {
  return;
  }
- dst[i] = x[i] + y[i];
+ dst[i] = x[i] + y[i%ky];
  }

  static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
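A small usage sketch (illustration only, not from the diff): the new kx/ky arguments let add_f32 broadcast a smaller y across x, since each element is computed as x[i] + y[i % ky].

//   // hypothetical launch with kx = 8 elements of x and ky = 4 elements of y
//   const int kx = 8, ky = 4;
//   const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
//   add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
//   // dst = {x0+y0, x1+y1, x2+y2, x3+y3, x4+y0, x5+y1, x6+y2, x7+y3}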
@@ -320,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  }
  }

- static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

- const float eps = 1e-6f;
-
  float tmp = 0.0f; // partial sum for thread in warp

  for (int col = tid; col < ncols; col += WARP_SIZE) {
@@ -923,12 +933,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  uint16_t aux[4];
  const uint8_t * sc = (const uint8_t *)aux;

+ #if K_QUANTS_PER_ITERATION == 2
+ uint32_t q32[4];
+ const uint8_t * q4 = (const uint8_t *)q32;
+ #else
+ uint16_t q16[4];
+ const uint8_t * q4 = (const uint8_t *)q16;
+ #endif
+
  float tmp = 0; // partial sum for thread in warp

  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

- const uint8_t * q1 = x[i].qs + q_offset;
- const uint8_t * q2 = q1 + 64;
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

@@ -941,14 +957,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
  aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

+ #if K_QUANTS_PER_ITERATION == 2
+ const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+ const uint32_t * q2 = q1 + 16;
+
+ q32[0] = q1[0] & 0x0f0f0f0f;
+ q32[1] = q1[0] & 0xf0f0f0f0;
+ q32[2] = q2[0] & 0x0f0f0f0f;
+ q32[3] = q2[0] & 0xf0f0f0f0;
+
  float4 s = {0.f, 0.f, 0.f, 0.f};
  float smin = 0;
- for (int l = 0; l < n; ++l) {
- s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
- s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+ for (int l = 0; l < 4; ++l) {
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+ s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
  smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
  }
- tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+ #else
+ const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+ const uint16_t * q2 = q1 + 32;
+
+ q16[0] = q1[0] & 0x0f0f;
+ q16[1] = q1[0] & 0xf0f0;
+ q16[2] = q2[0] & 0x0f0f;
+ q16[3] = q2[0] & 0xf0f0;
+
+ float4 s = {0.f, 0.f, 0.f, 0.f};
+ float smin = 0;
+ for (int l = 0; l < 2; ++l) {
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+ s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+ }
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+ #endif

  }
  #else
@@ -1028,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
  uint16_t aux[4];
  const uint8_t * sc = (const uint8_t *)aux;

+ uint16_t q16[8];
+ const uint8_t * q4 = (const uint8_t *)q16;
+
  for (int i = ix; i < num_blocks_per_row; i += 2) {

  const uint8_t * ql1 = x[i].qs + q_offset;
- const uint8_t * ql2 = ql1 + 64;
  const uint8_t * qh = x[i].qh + l0;
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;
@@ -1047,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,

  float4 sum = {0.f, 0.f, 0.f, 0.f};
  float smin = 0;
+ const uint16_t * q1 = (const uint16_t *)ql1;
+ const uint16_t * q2 = q1 + 32;
+ q16[0] = q1[0] & 0x0f0f;
+ q16[1] = q1[8] & 0x0f0f;
+ q16[2] = (q1[0] >> 4) & 0x0f0f;
+ q16[3] = (q1[8] >> 4) & 0x0f0f;
+ q16[4] = q2[0] & 0x0f0f;
+ q16[5] = q2[8] & 0x0f0f;
+ q16[6] = (q2[0] >> 4) & 0x0f0f;
+ q16[7] = (q2[8] >> 4) & 0x0f0f;
  for (int l = 0; l < n; ++l) {
- sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
- + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
- sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
- + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
- sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
- + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
- sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
- + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+ sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+ + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+ sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+ + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+ sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+ + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+ sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+ + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
  smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
  }
@@ -1271,8 +1326,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
  y[iybs + iqs + y_offset] = v.y;
  }

- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

  int vi;
@@ -1293,11 +1349,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
  return sumi*d;
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

  const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1318,11 +1375,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
  return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

  int qs;
@@ -1353,11 +1411,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
  return sumi*d;
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

  const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1387,11 +1446,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
  return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

  int vi;
@@ -1406,7 +1466,342 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
  return sumi*d;
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ const float d = bq2_K->d;
+ const float dmin = bq2_K->dmin;
+
+ const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+ for (int i = 0; i < QR2_K; ++i) {
+ const int sc = bq2_K->scales[scale_offset + 2*i];
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const float d8i = bq8i->d;
+
+ const int vi = (v >> (2*i)) & 0x03030303;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+ }
+
+ return d*sumf_d - dmin*sumf_m;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+ float sumf = 0.0f;
+
+ const float d = bq3_K->d;
+
+ int vl;
+ memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+ int vh;
+ memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+ vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+ vh >>= bq8_offset;
+
+ for (int i = 0; i < QR3_K; ++i) {
+ const int isc = scale_offset + 2*i;
+
+ const int isc_low = isc % (QK_K/32);
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
+ const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+ const int isc_high = isc % (QK_K/64);
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
+ const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+ const int sc = (sc_low | sc_high) - 32;
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+ const float d8i = bq8i->d;
+
+ const int vil = (vl >> (2*i)) & 0x03030303;
+
+ const int vih = ((vh >> i) << 2) & 0x04040404;
+
+ const int vi = __vsubss4(vil, vih);
+
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+ }
+
+ return d*sumf;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ #ifndef GGML_QKK_64
+
+ // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+ const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
+ const float d = bq4_K->d;
+ const float dmin = bq4_K->dmin;
+
+ // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+ // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+ // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+ // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+ const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+ const int v1 = q4[0];
+ const int v2 = q4[4];
+
+ const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+ uint16_t aux[2];
+ const int j = bq8_offset/2;
+ if (j < 2) {
+ aux[0] = scales[j+0] & 0x3f3f;
+ aux[1] = scales[j+2] & 0x3f3f;
+ } else {
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+ }
+ const uint8_t * sc = (const uint8_t *)aux;
+ const uint8_t * m = sc + 2;
+
+ for (int i = 0; i < QR4_K; ++i) {
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const float d8i = bq8i->d;
+ const int * q8 = (const int *)bq8i->qs + (iqs%4);
+ const int ui1 = q8[0];
+ const int ui2 = q8[4];
+
+ const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+ const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
+
+ const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+ const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+ sumf_d += d8i * (dot1 * sc[i]);
+ sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+ }
+
+ return d*sumf_d - dmin*sumf_m;
+
+ #else
+
+ uint16_t aux16[2];
+ const uint8_t * s = (const uint8_t *)aux16;
+
+ const uint16_t * a = (const uint16_t *)bq4_K->scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+ const float dall = bq4_K->d[0];
+ const float dmin = bq4_K->d[1];
+
+ const float d8_1 = bq8_1[0].d;
+ const float d8_2 = bq8_1[1].d;
+
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+ const int * q4 = (const int *)bq4_K->qs + iqs;
+ const int v1 = q4[0];
+ const int v2 = q4[4];
+
+ const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+ const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+ const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+ const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+ sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+ sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+ return dall * sumf_d - dmin * sumf_m;
+
+ #endif
+
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+ #ifndef GGML_QKK_64
+
+ const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+ const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+ const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
+
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ const float d = bq5_K->d;
+ const float dmin = bq5_K->dmin;
+
+ const int vl1 = ql[0];
+ const int vl2 = ql[4];
+
+ const int vh1 = qh[0] >> bq8_offset;
+ const int vh2 = qh[4] >> bq8_offset;
+
+ const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+ uint16_t aux[2];
+ const int j = bq8_offset/2;
+ if (j < 2) {
+ aux[0] = scales[j+0] & 0x3f3f;
+ aux[1] = scales[j+2] & 0x3f3f;
+ } else {
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+ }
+ const uint8_t * sc = (const uint8_t *)aux;
+ const uint8_t * m = sc + 2;
+
+ for (int i = 0; i < QR5_K; ++i) {
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const float d8i = bq8i->d;
+ const int * q8 = (const int *)bq8i->qs + (iqs%4);
+ const int ui1 = q8[0];
+ const int ui2 = q8[4];
+
+ const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+ const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
+
+ const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+ const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
+
+ const int vi1 = vil1 | vih1;
+ const int vi2 = vil2 | vih2;
+
+ const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+ const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+ sumf_d += d8i * (dot1 * sc[i]);
+ sumf_m += d8i * (dot2 * m[i]);
+
+ }
+
+ return d*sumf_d - dmin*sumf_m;
+
+ #else
+
+ const int8_t * s = bq5_K->scales;
+
+ const float d = bq5_K->d;
+
+ const float d8_1 = bq8_1[0].d;
+ const float d8_2 = bq8_1[1].d;
+
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+ const int * ql = (const int *)bq5_K->qs + iqs;
+ const int vl1 = ql[0];
+ const int vl2 = ql[4];
+
+ const int step = 4 * iqs; // 0, 4, 8, 12
+ const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+ const int in = step%8; // 0, 4, 0, 4
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+ return d * sumf_d;
+
+ #endif
+
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+ float sumf = 0.0f;
+
+ const float d = bq6_K->d;
+
+ int vl;
+ memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+ int vh;
+ memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+ for (int i = 0; i < QR6_K; ++i) {
+ const int sc = bq6_K->scales[scale_offset + 4*i];
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+ const float d8i = bq8i->d;
+
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+ const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+ }
+
+ return d*sumf;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

  template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -1429,7 +1824,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
  const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index

- const int iby = i + threadIdx.x / qi; // y block index
+ const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx

  const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int

@@ -1515,11 +1910,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
  }
  }

- static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+ static __global__ void mul_mat_p021_f16_f32(
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
+
  const half * x = (const half *) vx;

  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+ const int channel_x = channel / (nchannels_y / nchannels_x);

  const int nrows_y = ncols_x;
  const int nrows_dst = nrows_x;
@@ -1535,7 +1934,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
  }

  // x is transposed and permuted
- const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
+ const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
  const float xi = __half2float(x[ix]);

  const int row_y = col_x;
@@ -1563,12 +1962,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const

  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
- const int row_stride_x, const int channel_stride_x) {
+ const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {

  const half * x = (const half *) vx;

  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+ const int channel_x = channel / channel_x_divisor;

  const int nrows_y = ncols_x;
  const int nrows_dst = nrows_x;
@@ -1585,7 +1985,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  break;
  }

- const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
+ const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
  const float xi = __half2float(x[ix]);

  const int row_y = col_x;
@@ -1667,6 +2067,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
  }

+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+ const int col = blockDim.x*blockIdx.x + threadIdx.x;
+ const int half_n_dims = ncols/4;
+
+ if (col >= half_n_dims) {
+ return;
+ }
+
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i = row*ncols + col;
+
+ const float col_theta_scale = powf(theta_scale, col);
+
+ const float theta = p*col_theta_scale;
+ const float sin_theta = sinf(theta);
+ const float cos_theta = cosf(theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + half_n_dims];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+ const float block_theta = block_p*col_theta_scale;
+ const float sin_block_theta = sinf(block_theta);
+ const float cos_block_theta = cosf(block_theta);
+
+ const float x2 = x[i + half_n_dims * 2];
+ const float x3 = x[i + half_n_dims * 3];
+
+ dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+ dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+ }
+
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1732,9 +2166,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
  dst[i] = scale * x[i];
  }

- static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+ static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+ const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
  }

  static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1763,10 +2197,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
  norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
  }

- static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
  const dim3 block_dims(WARP_SIZE, 1, 1);
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
  }

  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -1928,7 +2362,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
  }

  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK4_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1937,7 +2371,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
  }

  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK4_1 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1946,7 +2380,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
  }

  static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK5_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1955,7 +2389,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
  }

  static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK5_1 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1964,7 +2398,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
  }

  static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK8_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1972,6 +2406,57 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

+ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+ // kernel call instead of 2. This results in a better performance because the cost of computing the k-quant scales
+ // is better amortized.
+ mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+ // kernel call instead of 2. This results in a better performance because the cost of computing the k-quant scales
+ // is better amortized.
+ mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2015,20 +2500,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  }
  }

- static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
- const dim3 block_nums(1, nrows_x, nchannels_x);
+ static void ggml_mul_mat_p021_f16_f32_cuda(
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+ const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+ const dim3 block_nums(1, nrows_x, nchannels_y);
  const dim3 block_dims(WARP_SIZE, 1, 1);
- mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+ mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
  }

  static void ggml_mul_mat_vec_nc_f16_f32_cuda(
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
- const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+ const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {

- const dim3 block_nums(1, nrows_x, nchannels_x);
+ const dim3 block_nums(1, nrows_x, nchannels_y);
  const dim3 block_dims(WARP_SIZE, 1, 1);
  mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
  }

  static void ggml_cpy_f32_f32_cuda(
@@ -2064,6 +2552,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
  }

+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+ GGML_ASSERT(nrows % 4 == 0);
+ const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+ const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+ const dim3 block_nums(num_blocks_x, nrows, 1);
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+ }
+
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
  const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2106,20 +2602,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
  scoped_spin_lock lock(g_cuda_pool_lock);
  int id;
  CUDA_CHECK(cudaGetDevice(&id));
-
+ #ifdef DEBUG_CUDA_MALLOC
+ int nnz = 0;
+ size_t max_size = 0, tot_size = 0;
+ #endif
+ size_t best_diff = 1ull << 36;
+ int ibest = -1;
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
  cuda_buffer& b = g_cuda_buffer_pool[id][i];
- if (b.size >= size && b.ptr != nullptr) {
- void * ptr = b.ptr;
- *actual_size = b.size;
- b.ptr = nullptr;
- b.size = 0;
- return ptr;
+ if (b.ptr != nullptr) {
+ #ifdef DEBUG_CUDA_MALLOC
+ ++nnz;
+ tot_size += b.size;
+ if (b.size > max_size) max_size = b.size;
+ #endif
+ if (b.size >= size) {
+ size_t diff = b.size - size;
+ if (diff < best_diff) {
+ best_diff = diff;
+ ibest = i;
+ if (!best_diff) {
+ void * ptr = b.ptr;
+ *actual_size = b.size;
+ b.ptr = nullptr;
+ b.size = 0;
+ return ptr;
+ }
+ }
+ }
  }
  }
+ if (ibest >= 0) {
+ cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+ void * ptr = b.ptr;
+ *actual_size = b.size;
+ b.ptr = nullptr;
+ b.size = 0;
+ return ptr;
+ }
+ #ifdef DEBUG_CUDA_MALLOC
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+ (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+ #endif
  void * ptr;
- CUDA_CHECK(cudaMalloc((void **) &ptr, size));
- *actual_size = size;
+ size_t look_ahead_size = (size_t) (1.05 * size);
+ look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+ CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+ *actual_size = look_ahead_size;
  return ptr;
  }

@@ -2147,7 +2676,9 @@ static size_t g_scratch_offset = 0;

  static int g_device_count = -1;
  static int g_main_device = 0;
+ #ifndef GGML_CUDA_FORCE_DMMV
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+ #endif
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2170,7 +2701,9 @@ void ggml_init_cublas() {
  g_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;

+ #ifndef GGML_CUDA_FORCE_DMMV
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+ #endif
  }
  for (int id = 0; id < g_device_count; ++id) {
  g_tensor_split[id] /= total_vram;
@@ -2195,6 +2728,9 @@ void ggml_init_cublas() {
  }

  void ggml_cuda_set_tensor_split(const float * tensor_split) {
+ if (tensor_split == nullptr) {
+ return;
+ }
  bool all_zero = true;
  for (int i = 0; i < g_device_count; ++i) {
  if (tensor_split[i] != 0.0f) {
@@ -2293,17 +2829,15 @@ inline void ggml_cuda_op_add(
  GGML_ASSERT(src1_ddf_i != nullptr);
  GGML_ASSERT(dst_ddf_i != nullptr);

- // TODO: support broadcasting
- GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
-
  const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

- // const int64_t ne10 = src1->ne[0];
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];

  // compute
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
  add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
  } else {
@@ -2327,23 +2861,17 @@ inline void ggml_cuda_op_mul(
  GGML_ASSERT(dst_ddf_i != nullptr);

  const int64_t ne00 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];

- for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
- const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
- float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
- float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
- float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
-
- // compute
- mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
- }
+ mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);

  (void) dst;
  (void) src0_ddq_i;
  (void) i02;
+ (void) i1;
  }

  inline void ggml_cuda_op_gelu(
@@ -2423,8 +2951,11 @@ inline void ggml_cuda_op_rms_norm(
  const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
  // compute
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+ rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);

  (void) src1;
  (void) dst;
@@ -2452,18 +2983,27 @@ inline void ggml_cuda_op_mul_mat_vec(
  int id;
  CUDA_CHECK(cudaGetDevice(&id));

- const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+ bool mul_mat_vec_q_implemented =
+ src0->type == GGML_TYPE_Q4_0 ||
  src0->type == GGML_TYPE_Q4_1 ||
  src0->type == GGML_TYPE_Q5_0 ||
  src0->type == GGML_TYPE_Q5_1 ||
  src0->type == GGML_TYPE_Q8_0;
-
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
+ #if QK_K == 256
+ mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+ src0->type == GGML_TYPE_Q2_K ||
+ src0->type == GGML_TYPE_Q3_K ||
+ src0->type == GGML_TYPE_Q4_K ||
+ src0->type == GGML_TYPE_Q5_K ||
+ src0->type == GGML_TYPE_Q6_K;
+ #endif // QK_K == 256
+
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
  #endif

  if (use_mul_mat_vec_q) {
- int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
- padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
+ const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
+ ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
  size_t as;
  void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
  quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
@@ -2484,6 +3024,21 @@ inline void ggml_cuda_op_mul_mat_vec(
  case GGML_TYPE_Q8_0:
  mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
  break;
+ case GGML_TYPE_Q2_K:
+ mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q3_K:
+ mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q4_K:
+ mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q5_K:
+ mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q6_K:
+ mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
  default:
  GGML_ASSERT(false);
  break;
@@ -2615,17 +3170,31 @@ inline void ggml_cuda_op_rope(
  const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

- const int n_past = ((int32_t *) src1->data)[0];
- const int n_dims = ((int32_t *) src1->data)[1];
- const int mode = ((int32_t *) src1->data)[2];
- GGML_ASSERT(mode == 0);
+ const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_dims = ((int32_t *) dst->op_params)[1];
+ const int mode = ((int32_t *) dst->op_params)[2];
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
+ // RoPE alteration for extended context

- const float theta_scale = powf(10000.0, -2.0f/n_dims);
- const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
+ float freq_base, freq_scale;
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
+
+ bool is_glm = mode & 4;

  // compute
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+ if (is_glm) {
+ const float id_p = min(p, n_ctx - 2.f);
+ const float block_p = max(p - (n_ctx - 2.f), 0.f);
+ rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+ } else {
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+ }

+ (void) src1;
  (void) dst;
  (void) src0_ddq_i;
  (void) src1_ddf_i;
@@ -2644,11 +3213,12 @@ inline void ggml_cuda_op_diag_mask_inf(
  const int64_t ne01 = src0->ne[1];
  const int64_t i01_diff = i01_high - i01_low;

- const int n_past = ((int32_t *) src1->data)[0];
+ const int n_past = ((int32_t *) dst->op_params)[0];

  // compute
  diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);

+ (void) src1;
  (void) dst;
  (void) src0_ddq_i;
  (void) src1_ddf_i;
@@ -2716,6 +3286,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
  const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
  const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+ GGML_ASSERT(ne03 == ne13);

  const int64_t ne0 = dst->ne[0];
  const int64_t ne1 = dst->ne[1];
@@ -2727,12 +3300,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);

  // strides for iteration over dims 3 and 2
- const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
- const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
+ const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+ const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+ const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
  const int64_t src0_stride = ne00 * ne01 * stride_mod;
  const int64_t src1_stride = ne10 * ne11 * stride_mod;
  const int64_t dst_stride = ne0 * ne1 * stride_mod;

+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
+ const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+ const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+ GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
  const size_t src0_ts = ggml_type_size(src0->type);
  const size_t src0_bs = ggml_blck_size(src0->type);

@@ -2749,6 +3329,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2749
3329
  dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
2750
3330
 
2751
3331
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
3332
+ GGML_ASSERT(!(split && ne02 < ne12));
2752
3333
 
2753
3334
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
2754
3335
 
@@ -2785,7 +3366,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2785
3366
  row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
2786
3367
  } else {
2787
3368
  row_low = 0;
2788
- row_high = nrows0;
3369
+ row_high = nrows0*i02_divisor;
2789
3370
  }
2790
3371
  if (row_low == row_high) {
2791
3372
  continue;
@@ -2833,16 +3414,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2833
3414
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
2834
3415
  }
2835
3416
 
2836
- const int64_t i03_max = flatten_rows ? 1 : ne03;
2837
- const int64_t i02_max = flatten_rows ? 1 : ne02;
2838
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
2839
-
2840
3417
  for (int64_t i03 = 0; i03 < i03_max; i03++) {
2841
3418
  const int64_t i13 = i03 % ne13;
2842
3419
  for (int64_t i02 = 0; i02 < i02_max; i02++) {
2843
3420
  const int64_t i12 = i02 % ne12;
2844
3421
 
2845
- const int64_t i0 = i03*ne02 + i02;
3422
+ const int64_t i0 = i03*i02_max + i02;
2846
3423
 
2847
3424
  // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
2848
3425
  const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -2876,10 +3453,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2876
3453
  const int64_t i11 = i13*ne12 + i12;
2877
3454
 
2878
3455
  // for split tensors the data begins at i0 == i0_offset_low
2879
- char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
2880
- float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
3456
+ char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
3457
+ float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
2881
3458
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
2882
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
3459
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
2883
3460
 
2884
3461
  // for split tensors the data pointer needs to be rounded down
2885
3462
  // to the bin edge for i03, i02 bins beyond the first
@@ -2918,11 +3495,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2918
3495
  }
2919
3496
  }
2920
3497
 
2921
- if (!src0_on_device || !src0_is_contiguous) {
3498
+ if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
2922
3499
  if (src0_is_f32) {
2923
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
3500
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
2924
3501
  } else {
2925
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
3502
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
2926
3503
  }
2927
3504
  }
2928
3505
 
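The ggml_cuda_op hunks above add channel broadcasting: src1 may have more channels than src0 (ne12 a multiple of ne02), each src0 channel is reused for ne12/ne02 consecutive iterations (i02_divisor), and the device copy is gated on i02 % i02_divisor == 0. A small standalone sketch of that index mapping, illustrative only and not library code:

    // illustrative only: the channel-broadcast mapping used above
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne02 = 2;                  // src0 channels
        const int64_t ne12 = 8;                  // src1/dst channels (must be a multiple of ne02)
        const int64_t i02_divisor = ne12 / ne02; // each src0 channel serves this many iterations

        for (int64_t i02 = 0; i02 < ne12; i02++) {
            if (i02 % i02_divisor == 0) {
                printf("iteration %2lld: copy src0 channel %lld to the device\n",
                       (long long) i02, (long long) (i02 / i02_divisor));
            }
            printf("iteration %2lld: dst channel %lld uses src0 channel %lld\n",
                   (long long) i02, (long long) i02, (long long) (i02 / i02_divisor));
        }
        return 0;
    }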
@@ -3076,6 +3653,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
 
+ const int64_t ne12 = src1->ne[2];
+
  CUDA_CHECK(cudaSetDevice(g_main_device));
  cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
@@ -3088,7 +3667,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
  }
 
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3102,6 +3681,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
 
+ const int64_t ne12 = src1->ne[2];
+
  const int64_t nb01 = src0->nb[1];
  const int64_t nb02 = src0->nb[2];
 
@@ -3120,7 +3701,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int row_stride_x = nb01 / sizeof(half);
  const int channel_stride_x = nb02 / sizeof(half);
 
- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
  }
 
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3197,6 +3778,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  (void) dst;
  }
 
+ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_cpy(src0, dst, nullptr);
+ (void) src1;
+ }
+
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3256,7 +3842,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  size_t size = ggml_nbytes_split(tensor, nrows_split);
  const size_t original_size = size;
 
- // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
  if (ne0 % MATRIX_ROW_PADDING != 0) {
  size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
  * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
@@ -3272,7 +3858,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  }
 
 
- CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
 
  extra->data_device[id] = buf;
 
@@ -3306,6 +3892,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
  delete extra;
  }
 
+ static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+ static size_t g_temp_tensor_extra_index = 0;
+
+ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (g_temp_tensor_extras == nullptr) {
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = g_temp_tensor_extra_index;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+ }
+
  void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
  if (scratch && g_scratch_size == 0) {
  return;
@@ -3314,7 +3916,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  // recursively assign CUDA buffers until a compute tensor is found
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
  ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
  }
  }
@@ -3323,8 +3925,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  }
 
  tensor->backend = GGML_BACKEND_GPU;
- struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
+ struct ggml_tensor_extra_gpu * extra;
 
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW ||
@@ -3337,12 +3938,14 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
+ memcpy(&offset, tensor->op_params, sizeof(size_t));
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src0_ddc + offset;
  } else if (tensor->op == GGML_OP_CPY) {
  struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
  void * src1_ddv = src1_extra->data_device[g_main_device];
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src1_ddv;
  } else if (scratch) {
  GGML_ASSERT(size <= g_scratch_size);
@@ -3355,6 +3958,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
  g_scratch_buffer = data;
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = data + g_scratch_offset;
 
  g_scratch_offset += size;
@@ -3364,6 +3968,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  void * data;
  CUDA_CHECK(cudaMalloc(&data, size));
  CUDA_CHECK(cudaMemset(data, 0, size));
+ extra = new ggml_tensor_extra_gpu;
+ memset(extra, 0, sizeof(*extra));
  extra->data_device[g_main_device] = data;
  }
 
@@ -3416,30 +4022,41 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
  switch (tensor->op) {
- case GGML_OP_ADD:
+ case GGML_OP_DUP:
  if (!any_on_device) {
  return false;
  }
- func = ggml_cuda_add;
+ func = ggml_cuda_dup;
  break;
- case GGML_OP_MUL:
- if (!any_on_device) {
- return false;
- }
- func = ggml_cuda_mul;
- break;
- case GGML_OP_GELU:
+ case GGML_OP_ADD:
  if (!any_on_device) {
  return false;
  }
- func = ggml_cuda_gelu;
+ func = ggml_cuda_add;
  break;
- case GGML_OP_SILU:
+ case GGML_OP_MUL:
  if (!any_on_device) {
  return false;
  }
- func = ggml_cuda_silu;
+ func = ggml_cuda_mul;
  break;
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(tensor)) {
+ case GGML_UNARY_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_gelu;
+ break;
+ case GGML_UNARY_OP_SILU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_silu;
+ break;
+ default:
+ return false;
+ } break;
  case GGML_OP_NORM:
  if (!any_on_device) {
  return false;
@@ -3470,6 +4087,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_cpy;
  break;
+ case GGML_OP_CONT:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE: