llama_cpp 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +146 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +485 -67
- data/ext/llama_cpp/src/ggml-metal.m +52 -43
- data/ext/llama_cpp/src/ggml-metal.metal +587 -470
- data/ext/llama_cpp/src/ggml.c +105 -79
- data/ext/llama_cpp/src/ggml.h +13 -1
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +123 -66
- data/ext/llama_cpp/src/llama.h +34 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +12 -1
- metadata +2 -2
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -13,6 +13,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"

+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(

 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
     half d;                // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
     half d;        // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0 8
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
     half d;            // delta
     int8_t qs[QK8_0];  // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1 8
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
     half d; // delta
     half s; // unquantized sum
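Note: the QI* constants are the number of 32-bit integers worth of quants that one thread handles per block in the dp4a dot-product kernels; expressing them as QK/(4*QR) instead of hard-coded literals keeps them consistent with the block sizes and quant ratios defined above. A quick sanity check of the arithmetic (a standalone C snippet, not part of the diff; the values come from the defines above and the K-quant defines added below):

```c
#include <assert.h>

int main(void) {
    // QI = quants per block / (4 quants per 32-bit int * quants packed per byte)
    enum { QK4_0 = 32, QR4_0 = 2, QK8_0 = 32, QR8_0 = 1, QK_K = 256, QR2_K = 4 };
    assert(QK4_0 / (4 * QR4_0) == 4);   // matches the old literal QI4_0 = 4
    assert(QK8_0 / (4 * QR8_0) == 8);   // matches the old literal QI8_0 = 8
    assert(QK_K  / (4 * QR2_K) == 16);  // QI2_K for the 256-wide super-blocks
    return 0;
}
```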
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 #define K_SCALE_SIZE 12
 #endif

+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
@@ -151,6 +155,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4];    // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");

+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif

+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif

+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
     uint8_t ql[QK_K/2]; // quants, lower 4 bits
     uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -240,13 +252,13 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };

-static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

-    if (i >= k) {
+    if (i >= kx) {
         return;
     }
-    dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky];
 }

 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
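Note: the extra ky parameter lets add_f32 broadcast a smaller second operand across the first one, since the index into y wraps with i % ky; a vector of ky elements is re-applied across all kx elements of x. The same indexing rule on the host (a minimal standalone sketch, not part of the diff):

```c
// CPU mirror of the kernel's dst[i] = x[i] + y[i % ky].
// kx: total elements in x/dst, ky: length of the repeated operand y.
void add_broadcast_f32(const float *x, const float *y, float *dst, int kx, int ky) {
    for (int i = 0; i < kx; ++i) {
        dst[i] = x[i] + y[i % ky];
    }
}
```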
@@ -1271,8 +1283,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
     y[iybs + iqs + y_offset] = v.y;
 }

-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

     int vi;
@@ -1293,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1318,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

     int qs;
@@ -1353,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1387,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

     int vi;
@@ -1406,7 +1423,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq2_K->d;
+    const float dmin = bq2_K->dmin;
+
+    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = bq2_K->scales[scale_offset + 2*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf = 0.0f;
+
+    const float d = bq3_K->d;
+
+    int vl;
+    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+    vh >>= bq8_offset;
+
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq5_K->d;
+    const float dmin = bq5_K->dmin;
+
+    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
+    for (int i = 0; i < QR5_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> i) << 4) & 0x10101010;
+
+        const int vi = vil | vih;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    float sumf = 0.0f;
+
+    const float d = bq6_K->d;
+
+    int vl;
+    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = bq6_K->scales[scale_offset + 4*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
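Note: each vec_dot_*_q8_1 helper computes one thread's share of the dot product between a quantized block and a q8_1 block, using __dp4a (a 4-way signed byte dot product with 32-bit accumulate) on the packed quants and applying the block scales and minima afterwards. A scalar reference for what a single __dp4a(vi, ui, 0) contributes, handy for checking the bit manipulation on the CPU (a standalone sketch, not part of the diff):

```c
#include <stdint.h>

// Scalar equivalent of __dp4a(a, b, c): treat each 32-bit word as four
// signed bytes, multiply lane-wise and accumulate into c.
static int dp4a_ref(int32_t a, int32_t b, int32_t c) {
    for (int k = 0; k < 4; ++k) {
        const int8_t ab = (int8_t)(a >> (8 * k));
        const int8_t bb = (int8_t)(b >> (8 * k));
        c += (int)ab * (int)bb;
    }
    return c;
}
```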
@@ -1429,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index

-        const int iby = i + threadIdx.x / qi; // y block index
+        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx

         const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int

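Note: this index fix matters for the K-quants. An x block holds qk quants while a q8_1 block always holds QK8_1 = 32, so with qk = 256 one x block lines up with qk/QK8_1 = 8 consecutive y blocks and the y index has to be scaled; for the 32-wide formats the factor is 1 and behaviour is unchanged. A small illustration of the mapping (plain C, hypothetical values):

```c
#include <stdio.h>

// For each x block index, print the first q8_1 block it pairs with.
// With qk = 256 (QK_K) and QK8_1 = 32, x block 1 starts at y block 8, etc.
int main(void) {
    const int qk = 256, qk8_1 = 32;
    for (int ib = 0; ib < 4; ++ib) {
        printf("x block %d -> y blocks starting at %d\n", ib, ib * qk / qk8_1);
    }
    return 0;
}
```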
@@ -1667,6 +1897,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const float col_theta_scale = powf(theta_scale, col);
+
+    const float theta = p*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = block_p*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
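Note: rope_glm_f32 implements the two-part rotary embedding used on the is_glm path (mode & 4, wired up further down in this diff). Each row is split into quarters: the first pair of quarters is rotated by an angle derived from the position p, the second pair by an angle derived from block_p, both sharing the per-column frequency powf(theta_scale, col). A scalar sketch of the per-column angles (standalone, hypothetical values, not part of the diff):

```c
#include <math.h>

// Per-column rotation angles for the GLM-style variant: one angle stream is
// driven by the in-block position p, the other by the block position block_p.
void glm_rope_angles(float p, float block_p, float theta_scale, int half_n_dims,
                     float *theta, float *block_theta) {
    for (int col = 0; col < half_n_dims; ++col) {
        const float col_theta_scale = powf(theta_scale, (float) col);
        theta[col]       = p * col_theta_scale;
        block_theta[col] = block_p * col_theta_scale;
    }
}
```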
@@ -1732,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }

-static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }

 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1928,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }

 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1937,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1946,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1955,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1964,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1972,6 +2236,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2064,6 +2373,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }

+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(nrows % 4 == 0);
+    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2106,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0, tot_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
         cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.size >= size && b.ptr != nullptr) {
-            void * ptr = b.ptr;
-            *actual_size = b.size;
-            b.ptr = nullptr;
-            b.size = 0;
-            return ptr;
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            tot_size += b.size;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
         }
     }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
     void * ptr;
-    CUDA_CHECK(cudaMalloc((void **) &ptr, size));
-    *actual_size = size;
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+    *actual_size = look_ahead_size;
     return ptr;
 }

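Note: the reworked ggml_cuda_pool_malloc switches from "first buffer that fits" to a best-fit search (tracked via best_diff/ibest, with an early return on an exact match), and when nothing in the pool fits it over-allocates by about 5%, rounded up to a multiple of 256 bytes, so slightly larger follow-up requests can reuse the buffer instead of triggering another cudaMalloc. The rounding rule in isolation (a standalone sketch):

```c
#include <stddef.h>

// Look-ahead sizing used when a fresh allocation is unavoidable:
// grow the request by ~5% and round up to a 256-byte multiple.
static size_t look_ahead_size(size_t size) {
    size_t s = (size_t)(1.05 * size);
    return 256 * ((s + 255) / 256);
}
```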
@@ -2195,6 +2545,9 @@ void ggml_init_cublas() {
 }

 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {
@@ -2293,17 +2646,15 @@ inline void ggml_cuda_op_add(
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);

-    // TODO: support broadcasting
-    GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
-
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;

-
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];

     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
@@ -2327,19 +2678,12 @@ inline void ggml_cuda_op_mul(
     GGML_ASSERT(dst_ddf_i != nullptr);

     const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];

-    for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
-        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
-        float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
-        float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
-
-        // compute
-        mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-    }
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);

     (void) dst;
     (void) src0_ddq_i;
@@ -2452,13 +2796,22 @@ inline void ggml_cuda_op_mul_mat_vec(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));

-    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+    bool mul_mat_vec_q_implemented =
+        src0->type == GGML_TYPE_Q4_0 ||
         src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 ||
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
-
-
+#if QK_K == 256
+    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+        src0->type == GGML_TYPE_Q2_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q5_K ||
+        src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
+
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif

     if (use_mul_mat_vec_q) {
@@ -2484,6 +2837,21 @@ inline void ggml_cuda_op_mul_mat_vec(
             case GGML_TYPE_Q8_0:
                 mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
                 break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
             default:
                 GGML_ASSERT(false);
                 break;
@@ -2618,13 +2986,26 @@ inline void ggml_cuda_op_rope(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode = ((int32_t *) src1->data)[2];
-
+    const int n_ctx = ((int32_t *) src1->data)[3];
+
+    // RoPE alteration for extended context
+    float freq_base, freq_scale;
+    memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;

-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
+    bool is_glm = mode & 4;

     // compute
-    rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    if (is_glm) {
+        const float id_p = min(p, n_ctx - 2.f);
+        const float block_p = max(p - (n_ctx - 2.f), 0.f);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else {
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    }

     (void) dst;
     (void) src0_ddq_i;
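Note: with this change the RoPE operator reads three extra values from the parameter tensor (n_ctx plus the freq_base/freq_scale pair used for extended-context scaling), so the per-pair angle becomes pos * freq_scale * freq_base^(-2i/n_dims) instead of being hard-wired to base 10000. A scalar sketch of the angle computation (standalone, hypothetical values):

```c
#include <math.h>

// Angle for rotary dimension pair i at position pos, with the two scaling knobs
// that the new code reads from the op parameters.
static float rope_theta(int pos, int i, int n_dims, float freq_base, float freq_scale) {
    const float theta_scale = powf(freq_base, -2.0f / n_dims); // per-pair frequency decay
    return (float) pos * freq_scale * powf(theta_scale, (float) i);
}
```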
@@ -3197,6 +3578,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }

+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3306,6 +3692,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }

+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
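Note: ggml_cuda_alloc_temp_tensor_extra hands out ggml_tensor_extra_gpu slots round-robin from a lazily allocated pool of GGML_MAX_NODES entries instead of heap-allocating one per view/copy/scratch tensor, so these short-lived extras are recycled each graph evaluation rather than individually new'd. The allocation pattern in isolation (a simplified standalone sketch with placeholder types, not the diff's exact definitions):

```c
#include <string.h>

#define MAX_SLOTS 4096                       /* stands in for GGML_MAX_NODES */

typedef struct { void * data_device[16]; } extra_t;

static extra_t slots[MAX_SLOTS];
static size_t  next_slot = 0;

/* Round-robin "allocator": a slot is valid until it is handed out again
   MAX_SLOTS allocations later, i.e. roughly one graph's worth of temporaries. */
static extra_t * alloc_temp_extra(void) {
    extra_t * e = &slots[next_slot];
    next_slot = (next_slot + 1) % MAX_SLOTS;
    memset(e, 0, sizeof(*e));
    return e;
}
```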
@@ -3314,7 +3716,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
@@ -3323,8 +3725,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }

     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra;

     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -3339,10 +3740,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         if (tensor->op == GGML_OP_VIEW) {
             memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
         struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
         GGML_ASSERT(size <= g_scratch_size);
@@ -3355,6 +3758,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
             CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
             g_scratch_buffer = data;
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = data + g_scratch_offset;

         g_scratch_offset += size;
@@ -3364,6 +3768,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         void * data;
         CUDA_CHECK(cudaMalloc(&data, size));
         CUDA_CHECK(cudaMemset(data, 0, size));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
         extra->data_device[g_main_device] = data;
     }

@@ -3416,6 +3822,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

     switch (tensor->op) {
+        case GGML_OP_DUP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_ADD:
             if (!any_on_device) {
                 return false;
@@ -3470,6 +3882,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_cpy;
             break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE: