llama_cpp 0.3.5 → 0.3.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +549 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2526 -430
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +56 -34
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +445 -176
- data/ext/llama_cpp/src/ggml.h +125 -33
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +409 -210
- data/ext/llama_cpp/src/llama.h +19 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
```diff
@@ -183,6 +183,15 @@
 # define GGML_API
 #endif
 
+// TODO: support for clang
+#ifdef __GNUC__
+#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define GGML_DEPRECATED(func, hint) func
+#endif
+
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
```
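With this macro in place, a public declaration can be wrapped so the hint string surfaces as a compiler warning at every call site. A minimal sketch of the intended usage, with a hypothetical function `my_old_op` that is not part of the header:

```cpp
// Hypothetical API: my_old_op / my_new_op exist only for illustration.
GGML_DEPRECATED(GGML_API struct ggml_tensor * my_old_op(
        struct ggml_context * ctx,
        struct ggml_tensor  * a),
    "use my_new_op instead");
// GCC/Clang attach the hint via __attribute__((deprecated(...))),
// MSVC via __declspec(deprecated(...)); any other compiler sees a
// plain declaration and emits no warning.
```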
```diff
@@ -374,6 +383,10 @@ extern "C" {
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
 
+        GGML_OP_MAP_CUSTOM1_F32,
+        GGML_OP_MAP_CUSTOM2_F32,
+        GGML_OP_MAP_CUSTOM3_F32,
+
         GGML_OP_MAP_CUSTOM1,
         GGML_OP_MAP_CUSTOM2,
         GGML_OP_MAP_CUSTOM3,
```
```diff
@@ -570,6 +583,8 @@ extern "C" {
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
 
+    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
```
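Exporting `ggml_are_same_shape` lets callers validate operands before building a graph node. A minimal sketch, assuming `ctx`, `t0`, and `t1` are tensors created earlier:

```cpp
// Guard an element-wise op: ggml_add requires matching shapes.
if (!ggml_are_same_shape(t0, t1)) {
    fprintf(stderr, "shape mismatch\n");
    return NULL;
}
struct ggml_tensor * sum = ggml_add(ctx, t0, t1);
```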
```diff
@@ -1170,7 +1185,18 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
 
-    // custom RoPE, in-place, returns view(a)
+    // custom RoPE
+    GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale);
+
+    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
```
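The new non-inplace `ggml_rope_custom` exposes the two frequency parameters that plain `ggml_rope` fixes internally. A hedged sketch of a call for linear RoPE context scaling; 10000.0 is RoPE's conventional base and the 0.5 scale (compressing positions 2x) is illustrative, with tensor and count names standing in for values from a real transformer graph:

```cpp
// mode 0 = standard rotary embedding.
struct ggml_tensor * cur = ggml_rope_custom(
        ctx, Qcur, n_past, n_dims, /*mode =*/ 0, n_ctx,
        /*freq_base  =*/ 10000.0f,
        /*freq_scale =*/ 0.5f);
```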
```diff
@@ -1229,7 +1255,7 @@ extern "C" {
 
     // conv_1d with padding = half
     // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
```
```diff
@@ -1242,7 +1268,7 @@ extern "C" {
         GGML_OP_POOL_COUNT,
     };
 
-    GGML_API struct ggml_tensor* ggml_pool_1d(
+    GGML_API struct ggml_tensor * ggml_pool_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             enum ggml_op_pool     op,
```
```diff
@@ -1250,7 +1276,7 @@ extern "C" {
             int s0, // stride
             int p0); // padding
 
-    GGML_API struct ggml_tensor* ggml_pool_2d(
+    GGML_API struct ggml_tensor * ggml_pool_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             enum ggml_op_pool     op,
```
```diff
@@ -1304,15 +1330,6 @@ extern "C" {
             int h0,
             int w);
 
-    // custom operators
-
-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
     GGML_API struct ggml_tensor * ggml_unary(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
```
```diff
@@ -1323,63 +1340,138 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_unary_op    op);
 
-    GGML_API struct ggml_tensor * ggml_map_unary_f32(
+    // custom operators
+
+    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            ggml_unary_op_f32_t   fun);
+            ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            ggml_unary_op_f32_t   fun);
+            ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_binary_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            ggml_binary_op_f32_t  fun);
+            ggml_binary_op_f32_t  fun),
+        "use ggml_map_custom2 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            ggml_binary_op_f32_t  fun);
+            ggml_binary_op_f32_t  fun),
+        "use ggml_map_custom2_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            ggml_custom1_op_f32_t fun);
+            ggml_custom1_op_f32_t fun),
+        "use ggml_map_custom1 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            ggml_custom1_op_f32_t fun);
+            ggml_custom1_op_f32_t fun),
+        "use ggml_map_custom1_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            ggml_custom2_op_f32_t fun);
+            ggml_custom2_op_f32_t fun),
+        "use ggml_map_custom2 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            ggml_custom2_op_f32_t fun);
+            ggml_custom2_op_f32_t fun),
+        "use ggml_map_custom2_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             struct ggml_tensor  * c,
-            ggml_custom3_op_f32_t fun);
+            ggml_custom3_op_f32_t fun),
+        "use ggml_map_custom3 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             struct ggml_tensor  * c,
-            ggml_custom3_op_f32_t fun);
+            ggml_custom3_op_f32_t fun),
+        "use ggml_map_custom3_inplace instead");
+
+    // custom operators v2
+
+    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+    #define GGML_N_TASKS_MAX -1
+
+    GGML_API struct ggml_tensor * ggml_map_custom1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            ggml_custom1_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            ggml_custom1_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            ggml_custom2_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            ggml_custom2_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            ggml_custom3_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            ggml_custom3_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
 
     // loss function
 
```
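The v2 callbacks receive the task index `ith`, the task count `nth`, and an opaque `userdata` pointer, so an operator can partition its own work without globals. A minimal sketch of a scale-by-constant operator built on `ggml_map_custom1`; the helper names are ours, and the tensors are assumed contiguous F32:

```cpp
// Multiply every element of `a` by the float passed through userdata.
static void scale_op(struct ggml_tensor * dst, const struct ggml_tensor * a,
                     int ith, int nth, void * userdata) {
    const float k = *(const float *) userdata;
    const int64_t n   = ggml_nelements(dst);
    const int64_t per = (n + nth - 1) / nth;          // elements per task
    const int64_t i0  = per * ith;                    // this task's slice
    const int64_t i1  = i0 + per < n ? i0 + per : n;
    const float * src = (const float *) a->data;
    float       * out = (float *)       dst->data;
    for (int64_t i = i0; i < i1; ++i) {
        out[i] = src[i] * k;
    }
}

// GGML_N_TASKS_MAX lets the scheduler choose how many tasks to spawn.
static float two = 2.0f;
struct ggml_tensor * y = ggml_map_custom1(ctx, x, scale_op, GGML_N_TASKS_MAX, &two);
```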
data/ext/llama_cpp/src/k_quants.c
CHANGED
```diff
@@ -39,6 +39,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 //
 // 2-6 bit quantization in super-blocks
 //
```
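Older GCC releases lack `_mm256_set_m128i`, so the macro rebuilds it from two intrinsics that are always available: cast `b` into the low 128-bit lane, then insert `a` as the high lane. Every `_mm256_set_m128i(hi, lo)` call in the hunks below is rewritten to the macro; the two forms are equivalent:

```cpp
#include <immintrin.h>

__m128i lo = _mm_set1_epi16(1);
__m128i hi = _mm_set1_epi16(2);
// Same result as _mm256_set_m128i(hi, lo):
// 16-bit lanes 0-7 hold 1 (low half), lanes 8-15 hold 2 (high half).
__m256i v = MM256_SET_M128I(hi, lo);
```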
```diff
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
         __m256i sumi = _mm256_setzero_si256();
 
@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
 
         // sumf += -dmin * summs in 32bits*8
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
 
         const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
         const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         }
 
         // sumf += dall * isum - dmin * summs in 32bits
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
     }
 
@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         summs += dmin * smin;
 
         const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-        const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+        const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+        const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
 
         const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
         const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
         const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
 
-        const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-        const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-        const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-        const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+        const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+        const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+        const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+        const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
 
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
         // high bit
         const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         }
 
         // multiply with block scale and accumulate
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 
     }
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         aux16[0] = a & 0x0f0f;
         aux16[1] = (a >> 4) & 0x0f0f;
 
-        const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-        const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+        const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+        const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
 
         memcpy(&aux64, x[i].hmask, 8);
 
         const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+        __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
         __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
         q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
         q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
 
         // prepare low and high bits
-        const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+        const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
         const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
         const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
 
@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
 
         p16_0 = _mm_add_epi32(p16_0, p16_2);
         p16_1 = _mm_add_epi32(p16_1, p16_3);
-        __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+        __m256i p16 = MM256_SET_M128I(p16_1, p16_0);
 
         // multiply with block scale and accumulate
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
 
         const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
         __m256i sumi = _mm256_setzero_si256();
 
@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         }
 
         __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
     }
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
 
         const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
         const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
 
         const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
         const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
 
     }
 
@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         summs += dmin * _mm_extract_epi32(hsum, 0);
 
         const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
         const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
         __m256i hmask = mone;
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         }
 
         __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
     }
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
 
         const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
 
-        const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-        const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+        const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+        const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
 
         int64_t aux64;
         memcpy(&aux64, x[i].qh, 8);
         const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+        const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
 
         const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
         const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
         const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
 
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
 
     }
 
@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
 
         }
 
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
     }
 
@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
         const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
 
-        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
 
         const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
         const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
         sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
 
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
     }
 
     *s = hsum_float_8(acc);
```
data/ext/llama_cpp/src/llama-util.h
CHANGED
```diff
@@ -149,6 +149,46 @@ struct llama_file {
     }
 };
 
+// llama_context_data
+struct llama_data_context {
+    virtual void write(const void * src, size_t size) = 0;
+    virtual size_t get_size_written() = 0;
+    virtual ~llama_data_context() = default;
+};
+
+struct llama_data_buffer_context : llama_data_context {
+    uint8_t * ptr;
+    size_t size_written = 0;
+
+    llama_data_buffer_context(uint8_t * p) : ptr(p) {}
+
+    void write(const void * src, size_t size) override {
+        memcpy(ptr, src, size);
+        ptr += size;
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_file_context : llama_data_context {
+    llama_file * file;
+    size_t size_written = 0;
+
+    llama_data_file_context(llama_file * f) : file(f) {}
+
+    void write(const void * src, size_t size) override {
+        file->write_raw(src, size);
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
```
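The abstraction lets one serialization routine target either caller-owned memory (`llama_data_buffer_context`) or a `llama_file` (`llama_data_file_context`), with `get_size_written()` reporting progress in both cases. A hedged sketch of the pattern; `serialize_state`, `state_blob`, `state_size`, and `session_file` are illustrative names, not gem API:

```cpp
// One writer, two destinations, chosen by the caller.
static void serialize_state(llama_data_context & dctx, const void * blob, size_t n) {
    dctx.write(blob, n); // buffer: memcpy + advance; file: write_raw
}

uint8_t buf[4096];                                 // assumed large enough
llama_data_buffer_context to_buf(buf);
serialize_state(to_buf, state_blob, state_size);   // into memory

llama_data_file_context to_file(&session_file);    // llama_file opened elsewhere
serialize_state(to_file, state_blob, state_size);  // straight to disk
size_t written = to_file.get_size_written();       // bytes emitted so far
```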
```diff
@@ -179,7 +219,7 @@ struct llama_mmap {
         // prefetch/readahead impairs performance on NUMA systems
         if (numa) { prefetch = 0; }
 #ifdef __linux__
-        if (prefetch) { flags |= MAP_POPULATE; }
+        if (prefetch >= file->size) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
```
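On Linux, `MAP_POPULATE` pre-faults every page of the mapping regardless of how small the prefetch hint was, so the guard now requests it only when the hint covers the entire file; smaller hints fall back to ordinary demand paging. A sketch of the resulting behavior, assuming the surrounding constructor takes `(llama_file *, size_t prefetch, bool numa)`:

```cpp
// prefetch >= file size -> MAP_POPULATE: all pages faulted in up front.
llama_mmap eager(&file, /*prefetch =*/ SIZE_MAX, /*numa =*/ false);
// prefetch = 0 -> plain mmap: pages are loaded on first access.
llama_mmap lazy(&file, /*prefetch =*/ 0, /*numa =*/ false);
```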