llama_cpp 0.3.3 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +146 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +485 -67
- data/ext/llama_cpp/src/ggml-metal.m +52 -43
- data/ext/llama_cpp/src/ggml-metal.metal +587 -470
- data/ext/llama_cpp/src/ggml.c +105 -79
- data/ext/llama_cpp/src/ggml.h +13 -1
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +123 -66
- data/ext/llama_cpp/src/llama.h +34 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +12 -1
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu:

@@ -13,6 +13,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"

+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
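The new MIN_CC_DP4A constant names the compute capability (6.1) from which the __dp4a intrinsic — a dot product of four packed 8-bit values with a 32-bit accumulator — is available; every integer kernel below guards on it. As a rough host-side illustration of what __dp4a computes (a sketch with my own function name, not code from this gem):

    #include <cstdint>
    #include <cstdio>

    // Reference for what __dp4a(a, b, c) produces on the GPU: treat each 32-bit
    // word as four signed 8-bit lanes, multiply lane-wise, add the products to c.
    static int32_t dp4a_ref(int32_t a, int32_t b, int32_t c) {
        for (int k = 0; k < 4; ++k) {
            const int8_t av = (int8_t)(a >> (8 * k));
            const int8_t bv = (int8_t)(b >> (8 * k));
            c += (int32_t) av * (int32_t) bv;
        }
        return c;
    }

    int main() {
        // A word of all-ones bytes against a word of quants just sums the four lanes.
        printf("%d\n", dp4a_ref(0x01010101, 0x04030201, 0)); // prints 10
        return 0;
    }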
@@ -74,7 +76,7 @@ typedef void (*ggml_cuda_op_t)(

 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
     half d; // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
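The QI* macros now spell out their meaning: QK values per block, QR quant values packed into each byte, so the quants of one block occupy QK / (4 * QR) 32-bit integers, which is also how many threads cooperate on one block in mul_mat_vec_q. A quick host-side check of that arithmetic (sketch only, the helper name is mine):

    #include <cstdio>

    // QI(QK, QR): how many 32-bit ints hold one block's quants.
    constexpr int QI(int QK, int QR) { return QK / (4 * QR); }

    static_assert(QI(32, 2)  ==  4, "q4_0/q4_1/q5_0/q5_1: 16 quant bytes = 4 ints");
    static_assert(QI(32, 1)  ==  8, "q8_0/q8_1: 32 quant bytes = 8 ints");
    static_assert(QI(256, 4) == 16, "q2_K/q3_K with QK_K = 256");
    static_assert(QI(256, 2) == 32, "q4_K/q5_K/q6_K with QK_K = 256");

    int main() { printf("QI8_0 = %d\n", QI(32, 1)); return 0; }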
@@ -83,7 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -93,7 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
     half d; // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -103,7 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
     half d; // delta
     half m; // min
@@ -114,7 +116,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0 8
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
     half d; // delta
     int8_t qs[QK8_0]; // quants
@@ -123,7 +125,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1 8
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
     half d; // delta
     half s; // unquantized sum
@@ -143,6 +145,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 #define K_SCALE_SIZE 12
 #endif

+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4]; // quants
@@ -151,6 +155,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -163,6 +169,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");

+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d[2]; // super-block scales/mins
@@ -180,6 +188,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif

+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d; // super-block scale
@@ -199,6 +209,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif

+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
     uint8_t ql[QK_K/2]; // quants, lower 4 bits
     uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -240,13 +252,13 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };

-static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

-    if (i >= k) {
+    if (i >= kx) {
         return;
     }
-    dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky];
 }

 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
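add_f32 now takes two sizes so the second operand can be broadcast: element i of x is paired with element i % ky of y. A minimal CPU equivalent of the new indexing (a sketch, the function name is mine):

    #include <cstdio>

    // CPU sketch of the broadcast rule used by the updated add_f32 kernel:
    // x has kx elements, y has ky elements, and y repeats every ky values.
    static void add_broadcast(const float * x, const float * y, float * dst, int kx, int ky) {
        for (int i = 0; i < kx; ++i) {
            dst[i] = x[i] + y[i % ky];
        }
    }

    int main() {
        const float x[6] = {0, 1, 2, 3, 4, 5};
        const float y[3] = {10, 20, 30};
        float dst[6];
        add_broadcast(x, y, dst, 6, 3);
        for (float v : dst) printf("%g ", v); // 10 21 32 13 24 35
        printf("\n");
        return 0;
    }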
@@ -1271,8 +1283,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
     y[iybs + iqs + y_offset] = v.y;
 }

-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

     int vi;
@@ -1293,11 +1306,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1318,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

     int qs;
@@ -1353,11 +1368,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1387,11 +1403,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

     int vi;
@@ -1406,7 +1423,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 610
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq2_K->d;
+    const float dmin = bq2_K->dmin;
+
+    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = bq2_K->scales[scale_offset + 2*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf = 0.0f;
+
+    const float d = bq3_K->d;
+
+    int vl;
+    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+    vh >>= bq8_offset;
+
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq5_K->d;
+    const float dmin = bq5_K->dmin;
+
+    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
+    for (int i = 0; i < QR5_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> i) << 4) & 0x10101010;
+
+        const int vi = vil | vih;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    float sumf = 0.0f;
+
+    const float d = bq6_K->d;
+
+    int vl;
+    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = bq6_K->scales[scale_offset + 4*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
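All of the new vec_dot_q*_K_q8_1 functions above follow the same pattern: pull 32-bit words of packed quants, mask out the sub-quant bits, run __dp4a against the matching q8_1 word, and combine the per-scale partial sums as d*sumf_d - dmin*sumf_m. A scalar sketch of that super-block accumulation (hypothetical types and values, not the CUDA code itself):

    #include <cstdio>
    #include <vector>

    // Scalar sketch of the d*sum_d - dmin*sum_m accumulation the K-quant
    // vec_dot functions compute with __dp4a: q is an unsigned sub-quant value,
    // sc/m are the per-group scale and min, u is the q8_1 quant and d8 its scale.
    struct Group { float d8; int sc; int m; std::vector<int> q, u; };

    static float vec_dot_ref(float d, float dmin, const std::vector<Group> & groups) {
        float sumf_d = 0.0f, sumf_m = 0.0f;
        for (const Group & g : groups) {
            int dot = 0, sum_u = 0;
            for (size_t j = 0; j < g.q.size(); ++j) {
                dot   += g.q[j] * g.u[j]; // what __dp4a(vi, ui, 0) accumulates
                sum_u += g.u[j];          // what __dp4a(0x01010101, ui, 0) accumulates
            }
            sumf_d += g.d8 * dot * g.sc;
            sumf_m += g.d8 * sum_u * g.m;
        }
        return d * sumf_d - dmin * sumf_m;
    }

    int main() {
        Group g{0.5f, 3, 1, {1, 2, 0, 3}, {4, -2, 5, 1}};
        printf("%f\n", vec_dot_ref(0.1f, 0.05f, {g})); // 0.25
        return 0;
    }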
@@ -1429,7 +1659,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index

-        const int iby = i + threadIdx.x / qi; // y block index
+        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx

         const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int

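The iby change is what lets the same mul_mat_vec_q template drive the K-quants: one x block of qk values spans qk/QK8_1 consecutive q8_1 blocks of y, so the y block index has to be scaled to stay aligned with ibx (for qk == QK8_1 == 32 the factor is 1 and nothing changes). A small worked example of the index math (sketch):

    #include <cstdio>

    // y-block index alignment used by mul_mat_vec_q after the fix:
    // one x block of qk values overlaps qk/QK8_1 consecutive q8_1 blocks of y.
    int main() {
        const int QK8_1 = 32;
        const int qk = 256;           // e.g. QK_K for the K-quants
        const int i_plus_tid = 3;     // (i + threadIdx.x / qi) for some thread
        const int iby_old = i_plus_tid;              // wrong once qk != QK8_1
        const int iby_new = i_plus_tid * qk / QK8_1; // 3rd x block -> 24th y block
        printf("old %d, new %d\n", iby_old, iby_new); // old 3, new 24
        return 0;
    }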
@@ -1667,6 +1897,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const float col_theta_scale = powf(theta_scale, col);
+
+    const float theta = p*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = block_p*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
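The new rope_glm_f32 kernel rotates two pairs per column: (x0, x1) with an angle derived from p and (x2, x3) with an angle derived from block_p, both scaled by theta_scale^col. A host-side sketch of the per-column math, assuming the same row-major layout as the kernel:

    #include <cmath>
    #include <cstdio>

    // Host sketch of what rope_glm_f32 does for one column of one row:
    // rotate (x0, x1) by theta = p * theta_scale^col and
    // rotate (x2, x3) by block_theta = block_p * theta_scale^col.
    static void rope_glm_ref(float * row, int ncols, int col, float p, float block_p, float theta_scale) {
        const int half_n_dims = ncols / 4;
        const float col_theta_scale = powf(theta_scale, (float) col);

        const float theta = p * col_theta_scale;
        const float x0 = row[col], x1 = row[col + half_n_dims];
        row[col]               = x0 * cosf(theta) - x1 * sinf(theta);
        row[col + half_n_dims] = x0 * sinf(theta) + x1 * cosf(theta);

        const float block_theta = block_p * col_theta_scale;
        const float x2 = row[col + 2 * half_n_dims], x3 = row[col + 3 * half_n_dims];
        row[col + 2 * half_n_dims] = x2 * cosf(block_theta) - x3 * sinf(block_theta);
        row[col + 3 * half_n_dims] = x2 * sinf(block_theta) + x3 * cosf(block_theta);
    }

    int main() {
        float row[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        rope_glm_ref(row, 8, 0, 1.0f, 0.0f, 0.5f);
        for (float v : row) printf("%g ", v);
        printf("\n");
        return 0;
    }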
@@ -1732,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }

-static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }

 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1928,7 +2192,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }

 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1937,7 +2201,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1946,7 +2210,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1955,7 +2219,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1964,7 +2228,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols %
+    GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1972,6 +2236,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2064,6 +2373,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }

+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(nrows % 4 == 0);
+    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2106,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0, tot_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
         cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.size >= size && b.ptr != nullptr) {
-            void * ptr = b.ptr;
-            *actual_size = b.size;
-            b.ptr = nullptr;
-            b.size = 0;
-            return ptr;
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            tot_size += b.size;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
         }
     }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
     void * ptr;
-    CUDA_CHECK(cudaMalloc((void **) &ptr, size));
-    *actual_size = size;
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+    *actual_size = look_ahead_size;
     return ptr;
 }

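The pool allocator now does a best-fit search over the cached buffers instead of taking the first one that fits, and over-allocates fresh buffers by roughly 5% (rounded up to 256 bytes) so slightly larger follow-up requests can reuse them. The selection logic, reduced to a host-side sketch (malloc stands in for cudaMalloc):

    #include <cstdio>
    #include <cstdlib>

    // Sketch of the new pool strategy: best-fit over cached buffers,
    // 5% look-ahead (rounded to 256 bytes) when a fresh allocation is needed.
    struct Buf { void * ptr; size_t size; };

    static void * pool_malloc_ref(Buf * pool, int n, size_t size, size_t * actual_size) {
        size_t best_diff = (size_t)1 << 36;
        int ibest = -1;
        for (int i = 0; i < n; ++i) {
            if (pool[i].ptr != nullptr && pool[i].size >= size) {
                const size_t diff = pool[i].size - size;
                if (diff < best_diff) { best_diff = diff; ibest = i; }
            }
        }
        if (ibest >= 0) {
            void * ptr = pool[ibest].ptr;
            *actual_size = pool[ibest].size;
            pool[ibest] = {nullptr, 0};
            return ptr;
        }
        size_t look_ahead = (size_t)(1.05 * size);
        look_ahead = 256 * ((look_ahead + 255) / 256);
        *actual_size = look_ahead;
        return malloc(look_ahead); // stands in for cudaMalloc
    }

    int main() {
        Buf pool[2] = {{malloc(4096), 4096}, {malloc(1024), 1024}};
        size_t actual = 0;
        void * p = pool_malloc_ref(pool, 2, 1000, &actual);
        printf("reused %zu bytes\n", actual); // 1024: the tighter fit wins
        free(p); free(pool[0].ptr);
        return 0;
    }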
@@ -2195,6 +2545,9 @@ void ggml_init_cublas() {
 }

 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {
@@ -2293,17 +2646,15 @@ inline void ggml_cuda_op_add(
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);

-    // TODO: support broadcasting
-    GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1));
-
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;

-
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];

     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
@@ -2327,19 +2678,12 @@ inline void ggml_cuda_op_mul(
     GGML_ASSERT(dst_ddf_i != nullptr);

     const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];

-    for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
-        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
-        float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
-        float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
-
-        // compute
-        mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-    }
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);

     (void) dst;
     (void) src0_ddq_i;
@@ -2452,13 +2796,22 @@ inline void ggml_cuda_op_mul_mat_vec(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));

-    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+    bool mul_mat_vec_q_implemented =
+        src0->type == GGML_TYPE_Q4_0 ||
         src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 ||
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
-
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
+#if QK_K == 256
+    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+        src0->type == GGML_TYPE_Q2_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q5_K ||
+        src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
+
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif

     if (use_mul_mat_vec_q) {
@@ -2484,6 +2837,21 @@ inline void ggml_cuda_op_mul_mat_vec(
             case GGML_TYPE_Q8_0:
                 mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
                 break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
             default:
                 GGML_ASSERT(false);
                 break;
@@ -2618,13 +2986,26 @@ inline void ggml_cuda_op_rope(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode = ((int32_t *) src1->data)[2];
-
+    const int n_ctx = ((int32_t *) src1->data)[3];
+
+    // RoPE alteration for extended context
+    float freq_base, freq_scale;
+    memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;

-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
+    bool is_glm = mode & 4;

     // compute
-    rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    if (is_glm) {
+        const float id_p = min(p, n_ctx - 2.f);
+        const float block_p = max(p - (n_ctx - 2.f), 0.f);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else {
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    }

     (void) dst;
     (void) src0_ddq_i;
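ggml_cuda_op_rope now reads freq_base and freq_scale from the op parameters instead of using fixed values, and dispatches to the GLM path when mode & 4 is set. A scalar sketch of the parameter math (the 10000.0/1.0 arguments below are the usual RoPE defaults, used here only as example inputs):

    #include <cmath>
    #include <cstdio>

    // Sketch of the extended-context RoPE parameters now honored on CUDA:
    // the angle for column pair c of a token at position pos is
    //   theta = (pos * freq_scale) * freq_base^(-2*c / n_dims)
    static float rope_theta(int pos, int c, int n_dims, float freq_base, float freq_scale) {
        const float theta_scale = powf(freq_base, -2.0f / n_dims);
        const float p = pos * freq_scale;
        return p * powf(theta_scale, (float) c);
    }

    int main() {
        // Halving freq_scale compresses positions: the "linear scaling" trick.
        printf("%f\n", rope_theta(100, 3, 128, 10000.0f, 1.0f));
        printf("%f\n", rope_theta(100, 3, 128, 10000.0f, 0.5f));
        return 0;
    }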
@@ -3197,6 +3578,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }

+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3306,6 +3692,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }

+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
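ggml_cuda_alloc_temp_tensor_extra hands out ggml_tensor_extra_gpu slots from a fixed array of GGML_MAX_NODES entries, cycling an index instead of doing a heap allocation per view/copy/scratch tensor. The reuse pattern in miniature (a sketch with made-up sizes):

    #include <cstdio>
    #include <cstring>

    // Miniature of the ring of temporary per-tensor extras: a fixed pool,
    // an index that wraps, and a memset instead of new/delete per request.
    struct Extra { void * data_device[8]; };

    static Extra  g_extras[16];   // stands in for GGML_MAX_NODES entries
    static size_t g_extra_index = 0;

    static Extra * alloc_temp_extra() {
        Extra * extra = &g_extras[g_extra_index];
        g_extra_index = (g_extra_index + 1) % 16;
        memset(extra, 0, sizeof(*extra));
        return extra;
    }

    int main() {
        Extra * a = alloc_temp_extra();
        Extra * b = alloc_temp_extra();
        printf("distinct slots: %d\n", a != b); // 1, until the index wraps around
        return 0;
    }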
@@ -3314,7 +3716,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
@@ -3323,8 +3725,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }

     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra;

     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -3339,10 +3740,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         if (tensor->op == GGML_OP_VIEW) {
             memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
         struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
         GGML_ASSERT(size <= g_scratch_size);
@@ -3355,6 +3758,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
             CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
             g_scratch_buffer = data;
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = data + g_scratch_offset;

         g_scratch_offset += size;
@@ -3364,6 +3768,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         void * data;
         CUDA_CHECK(cudaMalloc(&data, size));
         CUDA_CHECK(cudaMemset(data, 0, size));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
         extra->data_device[g_main_device] = data;
     }

@@ -3416,6 +3822,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

     switch (tensor->op) {
+        case GGML_OP_DUP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_ADD:
             if (!any_on_device) {
                 return false;
@@ -3470,6 +3882,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_cpy;
             break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE: