llama_cpp 0.3.3 → 0.3.4

This diff shows the changes between two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only. All of the hunks below modify the package's bundled GGML CUDA source.
@@ -13,6 +13,8 @@
  #include "ggml-cuda.h"
  #include "ggml.h"

+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
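
Note: __dp4a(a, b, c) multiplies the four packed 8-bit values of a and b element-wise and adds the results to c in a single instruction; it requires compute capability 6.1, which is what the new MIN_CC_DP4A constant records. Devices below that threshold never reach these code paths at runtime (the host code further down gates them on g_compute_capabilities[id] >= MIN_CC_DP4A), so the #else branches only exist to keep the compiler happy. A minimal, hedged sketch of the guard pattern used throughout this file (the helper name is made up for illustration):

    // Illustrative only, not part of the diff.
    static __device__ __forceinline__ int dp4a_or_zero(const int a, const int b, const int c) {
    #if __CUDA_ARCH__ >= MIN_CC_DP4A
        return __dp4a(a, b, c); // byte-wise dot product of a and b, accumulated into c
    #else
        (void) a; (void) b; (void) c;
        return 0; // only to satisfy the compiler, mirroring the pattern used below
    #endif
    }
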
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(

  #define QK4_0 32
  #define QR4_0 2
- #define QI4_0 4
+ #define QI4_0 (QK4_0 / (4 * QR4_0))
  typedef struct {
  half d; // delta
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
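
Note: the QI* constants are now derived rather than hard-coded: QI = QK / (4 * QR), i.e. the number of 32-bit ints that hold one block's quantized data (QK quants at QR quants per byte, 4 bytes per int). The values are unchanged, for example QI4_0 = 32 / (4 * 2) = 4 and QI8_0 = 32 / (4 * 1) = 8, but the same formula now also covers the K-quant constants introduced below (e.g. QI2_K = QK_K / (4 * 4) = 16 when QK_K is 256). A hedged compile-time check, for illustration only:

    // Illustrative only, not part of the diff.
    static_assert(QI4_0 == 4 && QI8_0 == 8, "QI = QK / (4 * QR) reproduces the old literal values");
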
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

  #define QK4_1 32
  #define QR4_1 2
- #define QI4_1 4
+ #define QI4_1 (QK4_1 / (4 * QR4_1))
  typedef struct {
  half d; // delta
  half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

  #define QK5_0 32
  #define QR5_0 2
- #define QI5_0 4
+ #define QI5_0 (QK5_0 / (4 * QR5_0))
  typedef struct {
  half d; // delta
  uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

  #define QK5_1 32
  #define QR5_1 2
- #define QI5_1 4
+ #define QI5_1 (QK5_1 / (4 * QR5_1))
  typedef struct {
  half d; // delta
  half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

  #define QK8_0 32
  #define QR8_0 1
- #define QI8_0 8
+ #define QI8_0 (QK8_0 / (4 * QR8_0))
  typedef struct {
  half d; // delta
  int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

  #define QK8_1 32
  #define QR8_1 1
- #define QI8_1 8
+ #define QI8_1 (QK8_1 / (4 * QR8_1))
  typedef struct {
  half d; // delta
  half s; // unquantized sum
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
  #define K_SCALE_SIZE 12
  #endif

+ #define QR2_K 4
+ #define QI2_K (QK_K / (4*QR2_K))
  typedef struct {
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
  uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
  } block_q2_K;
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

+ #define QR3_K 4
+ #define QI3_K (QK_K / (4*QR3_K))
  typedef struct {
  uint8_t hmask[QK_K/8]; // quants - high bit
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
  } block_q3_K;
  //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");

+ #define QR4_K 2
+ #define QI4_K (QK_K / (4*QR4_K))
  #ifdef GGML_QKK_64
  typedef struct {
  half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
  #endif

+ #define QR5_K 2
+ #define QI5_K (QK_K / (4*QR5_K))
  #ifdef GGML_QKK_64
  typedef struct {
  half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
  #endif

+ #define QR6_K 2
+ #define QI6_K (QK_K / (4*QR6_K))
  typedef struct {
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
  uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -240,13 +252,13 @@ struct ggml_tensor_extra_gpu {
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
  };

- static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+ static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

- if (i >= k) {
+ if (i >= kx) {
  return;
  }
- dst[i] = x[i] + y[i];
+ dst[i] = x[i] + y[i%ky];
  }

  static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
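
Note: add_f32 now receives the flattened sizes of both operands and indexes the second one modulo ky, so a smaller y is repeated across x. This is the kernel-side half of the broadcasting support that lets ggml_cuda_op_add drop its "no broadcasting" assert later in this diff. A hedged host-side illustration of the indexing, with made-up sizes:

    // Illustrative only, not part of the diff: what dst[i] = x[i] + y[i % ky] produces.
    #include <cstdio>
    int main() {
        const int kx = 8, ky = 4;
        const float x[kx] = {0, 1, 2, 3, 4, 5, 6, 7};
        const float y[ky] = {10, 20, 30, 40};
        for (int i = 0; i < kx; ++i) {
            printf("%g ", x[i] + y[i % ky]); // prints: 10 21 32 43 14 25 36 47
        }
        printf("\n");
        return 0;
    }
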
@@ -1271,8 +1283,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
  y[iybs + iqs + y_offset] = v.y;
  }

- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

  int vi;
@@ -1293,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
  return sumi*d;
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

  const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1318,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
  return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

  int qs;
@@ -1353,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
  return sumi*d;
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

  const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1387,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
  return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
- #if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

  int vi;
@@ -1406,7 +1423,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
  return sumi*d;
  #else
  return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= 610
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ const float d = bq2_K->d;
+ const float dmin = bq2_K->dmin;
+
+ const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+ for (int i = 0; i < QR2_K; ++i) {
+ const int sc = bq2_K->scales[scale_offset + 2*i];
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const float d8i = bq8i->d;
+
+ const int vi = (v >> (2*i)) & 0x03030303;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+ }
+
+ return d*sumf_d - dmin*sumf_m;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+ float sumf = 0.0f;
+
+ const float d = bq3_K->d;
+
+ int vl;
+ memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+ int vh;
+ memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+ vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+ vh >>= bq8_offset;
+
+ for (int i = 0; i < QR3_K; ++i) {
+ const int isc = scale_offset + 2*i;
+
+ const int isc_low = isc % (QK_K/32);
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
+ const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+ const int isc_high = isc % (QK_K/64);
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
+ const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+ const int sc = (sc_low | sc_high) - 32;
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+ const float d8i = bq8i->d;
+
+ const int vil = (vl >> (2*i)) & 0x03030303;
+
+ const int vih = ((vh >> i) << 2) & 0x04040404;
+
+ const int vi = __vsubss4(vil, vih);
+
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+ }
+
+ return d*sumf;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+ const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ const float d = bq4_K->d;
+ const float dmin = bq4_K->dmin;
+
+ const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+ for (int i = 0; i < QR4_K; ++i) {
+ const int isc = bq8_offset + i;
+
+ uint8_t sc, m;
+ get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+ const float d8i = bq8i->d;
+
+ const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+ }
+
+ return d*sumf_d - dmin*sumf_m;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+ const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ const float d = bq5_K->d;
+ const float dmin = bq5_K->dmin;
+
+ const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+ const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
+ for (int i = 0; i < QR5_K; ++i) {
+ const int isc = bq8_offset + i;
+
+ uint8_t sc, m;
+ get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+ const float d8i = bq8i->d;
+
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+ const int vih = ((vh >> i) << 4) & 0x10101010;
+
+ const int vi = vil | vih;
+
+ sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+ sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
+ }
+
+ return d*sumf_d - dmin*sumf_m;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+ float sumf = 0.0f;
+
+ const float d = bq6_K->d;
+
+ int vl;
+ memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+ int vh;
+ memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+ for (int i = 0; i < QR6_K; ++i) {
+ const int sc = bq6_K->scales[scale_offset + 4*i];
+
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+ const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+ const float d8i = bq8i->d;
+
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+ const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+ sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+ }
+
+ return d*sumf;
+ #else
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

  template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
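
Note: all of the new vec_dot_q*_K_q8_1 functions above follow the same scheme as the existing non-K variants: four quantized values are unpacked from a 32-bit word with shifts and masks, then multiplied against four int8 values from the q8_1 block with __dp4a. The per-block minimum (dmin) terms use __dp4a(0x01010101, ui, 0), which multiplies every byte of ui by 1 and therefore just sums the four packed q8_1 values. A hedged illustration of that trick (the helper name is made up):

    // Illustrative only, not part of the diff.
    static __device__ int sum_of_four_bytes(const int ui) {
    #if __CUDA_ARCH__ >= MIN_CC_DP4A
        return __dp4a(0x01010101, ui, 0); // 1*b0 + 1*b1 + 1*b2 + 1*b3
    #else
        return (signed char)(ui) + (signed char)(ui >> 8) + (signed char)(ui >> 16) + (signed char)(ui >> 24);
    #endif
    }
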
@@ -1429,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
  const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index

- const int iby = i + threadIdx.x / qi; // y block index
+ const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx

  const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int

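
Note on the iby change: a q8_1 block always covers QK8_1 = 32 values, while an x block covers qk values. For the 32-value types (q4_0 through q8_0) the factor qk/QK8_1 is 1, so nothing changes; for the K-quants qk is QK_K, so with the default QK_K = 256 each x block has to be paired with 256/32 = 8 consecutive y blocks, which is exactly what the extra factor provides.
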
@@ -1667,6 +1897,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
  }

+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+ const int col = blockDim.x*blockIdx.x + threadIdx.x;
+ const int half_n_dims = ncols/4;
+
+ if (col >= half_n_dims) {
+ return;
+ }
+
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i = row*ncols + col;
+
+ const float col_theta_scale = powf(theta_scale, col);
+
+ const float theta = p*col_theta_scale;
+ const float sin_theta = sinf(theta);
+ const float cos_theta = cosf(theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + half_n_dims];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+ const float block_theta = block_p*col_theta_scale;
+ const float sin_block_theta = sinf(block_theta);
+ const float cos_block_theta = cosf(block_theta);
+
+ const float x2 = x[i + half_n_dims * 2];
+ const float x3 = x[i + half_n_dims * 3];
+
+ dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+ dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+ }
+
  static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1732,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
  dst[i] = scale * x[i];
  }

- static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
- add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+ static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+ const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
  }

  static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1928,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
  }

  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK4_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1937,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
  }

  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK4_1 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1946,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
  }

  static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK5_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1955,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
  }

  static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK5_1 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1964,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
  }

  static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % QK8_0 == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1972,6 +2236,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

+ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
+ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+ }
+
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2064,6 +2373,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
  }

+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+ GGML_ASSERT(nrows % 4 == 0);
+ const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+ const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+ const dim3 block_nums(num_blocks_x, nrows, 1);
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+ }
+
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
  const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
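
Note: rope_glm_f32 assigns one thread per column pair, so only ncols/4 threads per row do work, and each of them writes four outputs (i, i + half_n_dims, i + 2*half_n_dims, i + 3*half_n_dims): the first pair is rotated by the in-block position p and the second pair by the block position block_p. The launcher above sizes the grid as ceil(ncols / (4*CUDA_ROPE_BLOCK_SIZE)) blocks of 4*CUDA_ROPE_BLOCK_SIZE threads in x with one row per block in y, so surplus threads simply return at the col >= half_n_dims check.
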
@@ -2106,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
  scoped_spin_lock lock(g_cuda_pool_lock);
  int id;
  CUDA_CHECK(cudaGetDevice(&id));
-
+ #ifdef DEBUG_CUDA_MALLOC
+ int nnz = 0;
+ size_t max_size = 0, tot_size = 0;
+ #endif
+ size_t best_diff = 1ull << 36;
+ int ibest = -1;
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
  cuda_buffer& b = g_cuda_buffer_pool[id][i];
- if (b.size >= size && b.ptr != nullptr) {
- void * ptr = b.ptr;
- *actual_size = b.size;
- b.ptr = nullptr;
- b.size = 0;
- return ptr;
+ if (b.ptr != nullptr) {
+ #ifdef DEBUG_CUDA_MALLOC
+ ++nnz;
+ tot_size += b.size;
+ if (b.size > max_size) max_size = b.size;
+ #endif
+ if (b.size >= size) {
+ size_t diff = b.size - size;
+ if (diff < best_diff) {
+ best_diff = diff;
+ ibest = i;
+ if (!best_diff) {
+ void * ptr = b.ptr;
+ *actual_size = b.size;
+ b.ptr = nullptr;
+ b.size = 0;
+ return ptr;
+ }
+ }
+ }
  }
  }
+ if (ibest >= 0) {
+ cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+ void * ptr = b.ptr;
+ *actual_size = b.size;
+ b.ptr = nullptr;
+ b.size = 0;
+ return ptr;
+ }
+ #ifdef DEBUG_CUDA_MALLOC
+ fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+ (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+ #endif
  void * ptr;
- CUDA_CHECK(cudaMalloc((void **) &ptr, size));
- *actual_size = size;
+ size_t look_ahead_size = (size_t) (1.05 * size);
+ look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+ CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+ *actual_size = look_ahead_size;
  return ptr;
  }

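
Note: the pool allocator now keeps scanning for the tightest fit (best_diff) instead of returning the first buffer that is large enough, returns immediately on an exact match, and over-allocates fresh buffers by 5%, rounded up to a multiple of 256 bytes, so that slightly larger follow-up requests can still be served from the pool. A hedged worked example of the look-ahead sizing:

    // Illustrative only, not part of the diff.
    size_t size            = 1000000;                               // requested bytes
    size_t look_ahead_size = (size_t) (1.05 * size);                // 1050000
    look_ahead_size        = 256 * ((look_ahead_size + 255) / 256); // 1050112, the next multiple of 256
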
@@ -2195,6 +2545,9 @@ void ggml_init_cublas() {
  }

  void ggml_cuda_set_tensor_split(const float * tensor_split) {
+ if (tensor_split == nullptr) {
+ return;
+ }
  bool all_zero = true;
  for (int i = 0; i < g_device_count; ++i) {
  if (tensor_split[i] != 0.0f) {
@@ -2293,17 +2646,15 @@ inline void ggml_cuda_op_add(
  GGML_ASSERT(src1_ddf_i != nullptr);
  GGML_ASSERT(dst_ddf_i != nullptr);

- // TODO: support broadcasting
- GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
-
  const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

- // const int64_t ne10 = src1->ne[0];
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];

  // compute
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
  add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
  } else {
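
Note: with the broadcasting add_f32 kernel in place (see the sketch after the add_f32 hunk above), ggml_cuda_op_add no longer asserts that src0 and src1 have the same number of elements: it passes kx = ne00*i01_diff for the src0 slice and ky = ne10*ne11 for one src1 matrix, and the i % ky indexing repeats src1 across src0's higher dimensions. ggml_cuda_op_mul in the next hunk is flattened the same way, replacing its per-row loop with a single kernel launch.
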
@@ -2327,19 +2678,12 @@ inline void ggml_cuda_op_mul(
  GGML_ASSERT(dst_ddf_i != nullptr);

  const int64_t ne00 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];

- for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
- const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
- float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
- float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
- float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
-
- // compute
- mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
- }
+ mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);

  (void) dst;
  (void) src0_ddq_i;
@@ -2452,13 +2796,22 @@ inline void ggml_cuda_op_mul_mat_vec(
  int id;
  CUDA_CHECK(cudaGetDevice(&id));

- const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+ bool mul_mat_vec_q_implemented =
+ src0->type == GGML_TYPE_Q4_0 ||
  src0->type == GGML_TYPE_Q4_1 ||
  src0->type == GGML_TYPE_Q5_0 ||
  src0->type == GGML_TYPE_Q5_1 ||
  src0->type == GGML_TYPE_Q8_0;
-
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
+ #if QK_K == 256
+ mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+ src0->type == GGML_TYPE_Q2_K ||
+ src0->type == GGML_TYPE_Q3_K ||
+ src0->type == GGML_TYPE_Q4_K ||
+ src0->type == GGML_TYPE_Q5_K ||
+ src0->type == GGML_TYPE_Q6_K;
+ #endif // QK_K == 256
+
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
  #endif

  if (use_mul_mat_vec_q) {
@@ -2484,6 +2837,21 @@ inline void ggml_cuda_op_mul_mat_vec(
  case GGML_TYPE_Q8_0:
  mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
  break;
+ case GGML_TYPE_Q2_K:
+ mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q3_K:
+ mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q4_K:
+ mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q5_K:
+ mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q6_K:
+ mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
  default:
  GGML_ASSERT(false);
  break;
@@ -2618,13 +2986,26 @@ inline void ggml_cuda_op_rope(
  const int n_past = ((int32_t *) src1->data)[0];
  const int n_dims = ((int32_t *) src1->data)[1];
  const int mode = ((int32_t *) src1->data)[2];
- GGML_ASSERT(mode == 0);
+ const int n_ctx = ((int32_t *) src1->data)[3];
+
+ // RoPE alteration for extended context
+ float freq_base, freq_scale;
+ memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;

- const float theta_scale = powf(10000.0, -2.0f/n_dims);
- const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
+ bool is_glm = mode & 4;

  // compute
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+ if (is_glm) {
+ const float id_p = min(p, n_ctx - 2.f);
+ const float block_p = max(p - (n_ctx - 2.f), 0.f);
+ rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+ } else {
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+ }

  (void) dst;
  (void) src0_ddq_i;
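
Note: theta_scale = powf(freq_base, -2.0f/n_dims) reduces to the old hard-coded powf(10000.0, -2.0f/n_dims) when freq_base is 10000, and freq_scale simply scales the position, so plain RoPE behaviour is unchanged by the extended-context parameters. For the GLM path (mode & 4) the position is split into an in-block part and a block part: with n_ctx = 2048 and p = 2050, for example, id_p = min(2050, 2046) = 2046 and block_p = max(2050 - 2046, 0) = 4, which are then passed to rope_glm_f32_cuda as the two rotation bases.
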
@@ -3197,6 +3578,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  (void) dst;
  }

+ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_cpy(src0, dst, nullptr);
+ (void) src1;
+ }
+
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3306,6 +3692,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
  delete extra;
  }

+ static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+ static size_t g_temp_tensor_extra_index = 0;
+
+ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (g_temp_tensor_extras == nullptr) {
+ g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = g_temp_tensor_extra_index;
+ g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+ }
+
  void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
  if (scratch && g_scratch_size == 0) {
  return;
@@ -3314,7 +3716,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  // recursively assign CUDA buffers until a compute tensor is found
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
  ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
  }
  }
@@ -3323,8 +3725,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  }

  tensor->backend = GGML_BACKEND_GPU;
- struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
+ struct ggml_tensor_extra_gpu * extra;

  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
  tensor->op == GGML_OP_VIEW ||
@@ -3339,10 +3740,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  if (tensor->op == GGML_OP_VIEW) {
  memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src0_ddc + offset;
  } else if (tensor->op == GGML_OP_CPY) {
  struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
  void * src1_ddv = src1_extra->data_device[g_main_device];
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src1_ddv;
  } else if (scratch) {
  GGML_ASSERT(size <= g_scratch_size);
@@ -3355,6 +3758,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
  g_scratch_buffer = data;
  }
+ extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = data + g_scratch_offset;

  g_scratch_offset += size;
@@ -3364,6 +3768,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  void * data;
  CUDA_CHECK(cudaMalloc(&data, size));
  CUDA_CHECK(cudaMemset(data, 0, size));
+ extra = new ggml_tensor_extra_gpu;
+ memset(extra, 0, sizeof(*extra));
  extra->data_device[g_main_device] = data;
  }

@@ -3416,6 +3822,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

  switch (tensor->op) {
+ case GGML_OP_DUP:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
  case GGML_OP_ADD:
  if (!any_on_device) {
  return false;
@@ -3470,6 +3882,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_cpy;
  break;
+ case GGML_OP_CONT:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE: