llama_cpp 0.3.5 → 0.3.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +549 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2526 -430
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +56 -34
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +445 -176
- data/ext/llama_cpp/src/ggml.h +125 -33
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +409 -210
- data/ext/llama_cpp/src/llama.h +19 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -183,6 +183,15 @@
 #    define GGML_API
 #endif
 
+// TODO: support for clang
+#ifdef __GNUC__
+#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define GGML_DEPRECATED(func, hint) func
+#endif
+
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
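Note: GGML_DEPRECATED attaches a compiler-specific deprecation message to a whole declaration, so callers of the old map_* API below get a warning at compile time. A minimal sketch of how a wrapped declaration expands (the function name here is hypothetical, not from the diff):

    // In the header:
    GGML_DEPRECATED(GGML_API int ggml_old_fn(int x), "use ggml_new_fn instead");

    // Under __GNUC__ this expands to:
    GGML_API int ggml_old_fn(int x) __attribute__((deprecated("use ggml_new_fn instead")));

    // Under _MSC_VER it expands to:
    __declspec(deprecated("use ggml_new_fn instead")) GGML_API int ggml_old_fn(int x);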
@@ -374,6 +383,10 @@ extern "C" {
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
 
+        GGML_OP_MAP_CUSTOM1_F32,
+        GGML_OP_MAP_CUSTOM2_F32,
+        GGML_OP_MAP_CUSTOM3_F32,
+
         GGML_OP_MAP_CUSTOM1,
         GGML_OP_MAP_CUSTOM2,
         GGML_OP_MAP_CUSTOM3,
@@ -570,6 +583,8 @@ extern "C" {
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
 
+    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
@@ -1170,7 +1185,18 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
 
-    // custom RoPE, in-place, returns view(a)
+    // custom RoPE
+    GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale);
+
+    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
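Note: ggml_rope_custom is the out-of-place counterpart of the existing ggml_rope_custom_inplace; the two extra parameters scale the rotary frequencies, which is how RoPE-scaled context extension is typically wired up. A usage sketch, assuming a graph is being built and that ctx0, Qcur, n_past, n_rot and n_ctx already exist (all names and values here are illustrative; freq_base 10000.0 with freq_scale 1.0 reproduces plain ggml_rope):

    struct ggml_tensor * q = ggml_rope_custom(
            ctx0, Qcur,
            n_past, n_rot, /*mode =*/ 0, n_ctx,
            /*freq_base  =*/ 10000.0f,
            /*freq_scale =*/ 1.0f);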
@@ -1229,7 +1255,7 @@ extern "C" {
 
     // conv_1d with padding = half
     // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,

@@ -1242,7 +1268,7 @@ extern "C" {
         GGML_OP_POOL_COUNT,
     };
 
-    GGML_API struct ggml_tensor* ggml_pool_1d(
+    GGML_API struct ggml_tensor * ggml_pool_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             enum ggml_op_pool     op,

@@ -1250,7 +1276,7 @@ extern "C" {
             int                   s0, // stride
             int                   p0); // padding
 
-    GGML_API struct ggml_tensor* ggml_pool_2d(
+    GGML_API struct ggml_tensor * ggml_pool_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             enum ggml_op_pool     op,

@@ -1304,15 +1330,6 @@ extern "C" {
             int                   h0,
             int                   w);
 
-    // custom operators
-
-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
     GGML_API struct ggml_tensor * ggml_unary(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1323,63 +1340,138 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_unary_op    op);
 
-    GGML_API struct ggml_tensor * ggml_map_unary_f32(
+    // custom operators
+
+    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            ggml_unary_op_f32_t   fun);
+            ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            ggml_unary_op_f32_t   fun);
+            ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_binary_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            ggml_binary_op_f32_t  fun);
+            ggml_binary_op_f32_t  fun),
+        "use ggml_map_custom2 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            ggml_binary_op_f32_t  fun);
+            ggml_binary_op_f32_t  fun),
+        "use ggml_map_custom2_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            ggml_custom1_op_f32_t fun);
+            ggml_custom1_op_f32_t fun),
+        "use ggml_map_custom1 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            ggml_custom1_op_f32_t fun);
+            ggml_custom1_op_f32_t fun),
+        "use ggml_map_custom1_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            ggml_custom2_op_f32_t fun);
+            ggml_custom2_op_f32_t fun),
+        "use ggml_map_custom2 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            ggml_custom2_op_f32_t fun);
+            ggml_custom2_op_f32_t fun),
+        "use ggml_map_custom2_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             struct ggml_tensor  * c,
-            ggml_custom3_op_f32_t fun);
+            ggml_custom3_op_f32_t fun),
+        "use ggml_map_custom3 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             struct ggml_tensor  * c,
-            ggml_custom3_op_f32_t fun);
+            ggml_custom3_op_f32_t fun),
+        "use ggml_map_custom3_inplace instead");
+
+    // custom operators v2
+
+    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+    #define GGML_N_TASKS_MAX -1
+
+    GGML_API struct ggml_tensor * ggml_map_custom1(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
 
     // loss function
 
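Note: unlike the deprecated f32 variants, the v2 callbacks receive the worker index ith, the worker count nth and an opaque userdata pointer, so a custom operator can partition its own work across threads. A sketch of a custom1 operator under those assumptions (the callback name, body and scale factor are illustrative, not part of the diff):

    // Illustrative: scale every element of `a` by a factor passed through userdata.
    static void scale_op(struct ggml_tensor * dst, const struct ggml_tensor * a,
                         int ith, int nth, void * userdata) {
        const float factor = *(const float *) userdata;
        const int64_t nr  = ggml_nrows(dst);
        const int64_t dr  = (nr + nth - 1) / nth;    // rows per worker
        const int64_t ir0 = dr * ith;                // this worker's first row
        const int64_t ir1 = ir0 + dr < nr ? ir0 + dr : nr;
        for (int64_t ir = ir0; ir < ir1; ++ir) {
            float       * d = (float *)       ((char *)       dst->data + ir * dst->nb[1]);
            const float * s = (const float *) ((const char *) a->data   + ir * a->nb[1]);
            for (int64_t i0 = 0; i0 < dst->ne[0]; ++i0) {
                d[i0] = s[i0] * factor;
            }
        }
    }

    // GGML_N_TASKS_MAX (-1) asks ggml to use as many tasks as there are threads:
    static float factor = 2.0f;
    struct ggml_tensor * out = ggml_map_custom1(ctx0, a, scale_op, GGML_N_TASKS_MAX, &factor);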
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -39,6 +39,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 //
 // 2-6 bit quantization in super-blocks
 //
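Note: MM256_SET_M128I(a, b) assembles a 256-bit integer vector with a in the high 128 bits and b in the low 128 bits. It is a drop-in replacement for the _mm256_set_m128i intrinsic, which is reportedly missing from some older compilers (it only appeared in GCC 8, as far as I can tell); the remaining k_quants.c hunks below apply the same mechanical substitution at each use site. Equivalence sketch:

    __m128i lo = _mm_set1_epi16(1), hi = _mm_set1_epi16(2);
    __m256i v  = MM256_SET_M128I(hi, lo);  // same value as _mm256_set_m128i(hi, lo)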
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
         __m256i sumi = _mm256_setzero_si256();
 

@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
 
         // sumf += -dmin * summs in 32bits*8
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
 
         const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
         const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));

@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         }
 
         // sumf += dall * isum - dmin * summs in 32bits
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
     }
 

@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         summs += dmin * smin;
 
         const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-        const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+        const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+        const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
 
         const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
         const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));

@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
         const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
 
-        const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-        const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-        const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-        const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+        const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+        const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+        const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+        const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
 
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);

@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
         // high bit
         const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);

@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         }
 
         // multiply with block scale and accumulate
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 
     }

@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         aux16[0] = a & 0x0f0f;
         aux16[1] = (a >> 4) & 0x0f0f;
 
-        const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-        const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+        const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+        const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
 
         memcpy(&aux64, x[i].hmask, 8);
 
         const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+        __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
         __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
         q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
         q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);

@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
 
         // prepare low and high bits
-        const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+        const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
         const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
         const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
 

@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
         p16_0 = _mm_add_epi32(p16_0, p16_2);
         p16_1 = _mm_add_epi32(p16_1, p16_3);
-        __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+        __m256i p16 = MM256_SET_M128I(p16_1, p16_0);
 
         // multiply with block scale and accumulate
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);

@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
 
         const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
         __m256i sumi = _mm256_setzero_si256();
 

@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         }
 
         __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
     }

@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
         const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
         const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
 
         const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
         const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
 
     }
 

@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         summs += dmin * _mm_extract_epi32(hsum, 0);
 
         const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
         const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
         __m256i hmask = mone;

@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         }
 
         __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
     }

@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
         const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
 
-        const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-        const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+        const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+        const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
 
         int64_t aux64;
         memcpy(&aux64, x[i].qh, 8);
         const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+        const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
 
         const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
         const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);

@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
         const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
 
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
 
     }
 

@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
         }
 
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
     }
 

@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
         const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
 
-        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
 
         const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
         const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);

@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
         sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
 
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
     }
 
     *s = hsum_float_8(acc);
data/ext/llama_cpp/src/llama-util.h
CHANGED
@@ -149,6 +149,46 @@ struct llama_file {
     }
 };
 
+// llama_context_data
+struct llama_data_context {
+    virtual void write(const void * src, size_t size) = 0;
+    virtual size_t get_size_written() = 0;
+    virtual ~llama_data_context() = default;
+};
+
+struct llama_data_buffer_context : llama_data_context {
+    uint8_t* ptr;
+    size_t size_written = 0;
+
+    llama_data_buffer_context(uint8_t * p) : ptr(p) {}
+
+    void write(const void * src, size_t size) override {
+        memcpy(ptr, src, size);
+        ptr += size;
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_file_context : llama_data_context {
+    llama_file* file;
+    size_t size_written = 0;
+
+    llama_data_file_context(llama_file * f) : file(f) {}
+
+    void write(const void * src, size_t size) override {
+        file->write_raw(src, size);
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
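Note: llama_data_context decouples state serialization from its destination. The buffer implementation memcpys into caller-provided memory, the file implementation streams through llama_file::write_raw, and both track how many bytes were produced. A usage sketch, assuming the definitions above are in scope (the write_blob helper is hypothetical, not part of the diff):

    // Hypothetical writer: emits a length-prefixed blob through whichever context it is given.
    static void write_blob(llama_data_context & dctx, const uint8_t * data, uint32_t n) {
        dctx.write(&n, sizeof(n));
        dctx.write(data, n);
    }

    uint8_t blob[64] = {0};
    uint8_t out[128];
    llama_data_buffer_context bctx(out);            // serialize into memory
    write_blob(bctx, blob, sizeof(blob));
    const size_t written = bctx.get_size_written(); // == sizeof(uint32_t) + sizeof(blob)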
@@ -179,7 +219,7 @@ struct llama_mmap {
         // prefetch/readahead impairs performance on NUMA systems
         if (numa) { prefetch = 0; }
 #ifdef __linux__
-        if (prefetch) { flags |= MAP_POPULATE; }
+        if (prefetch >= file->size) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
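Note: this llama_mmap change tightens the Linux path. MAP_POPULATE faults the whole mapping in up front, so it is now requested only when the prefetch hint covers the entire file; a partial prefetch no longer forces the full model to be paged in at mmap time.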