llama_cpp 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +293 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +304 -99
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +201 -71
- data/ext/llama_cpp/src/ggml-metal.metal +68 -54
- data/ext/llama_cpp/src/ggml.c +713 -978
- data/ext/llama_cpp/src/ggml.h +82 -17
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/llama.cpp +524 -121
- data/ext/llama_cpp/src/llama.h +60 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -199,6 +199,7 @@
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 48
+#define GGML_MAX_OP_PARAMS 32
 #define GGML_DEFAULT_N_THREADS 4
 
 
@@ -207,6 +208,7 @@
 
 #define GGML_UNUSED(x) (void)(x)
 
+#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
 #define GGML_ASSERT(x) \
     do { \
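The new GGML_PAD macro rounds x up to the next multiple of n, assuming n is a power of two: adding n-1 overshoots past the boundary, and masking with ~(n-1) snaps back down. A minimal standalone check (the test harness here is illustrative, not part of the package):

    #include <assert.h>

    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main(void) {
        assert(GGML_PAD( 1, 16) == 16);   // rounds up
        assert(GGML_PAD(16, 16) == 16);   // already aligned, unchanged
        assert(GGML_PAD(17, 16) == 32);   // next multiple
        return 0;
    }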
@@ -329,16 +331,6 @@ extern "C" {
|
|
329
331
|
GGML_OP_ARGMAX,
|
330
332
|
GGML_OP_REPEAT,
|
331
333
|
GGML_OP_REPEAT_BACK,
|
332
|
-
GGML_OP_ABS,
|
333
|
-
GGML_OP_SGN,
|
334
|
-
GGML_OP_NEG,
|
335
|
-
GGML_OP_STEP,
|
336
|
-
GGML_OP_TANH,
|
337
|
-
GGML_OP_ELU,
|
338
|
-
GGML_OP_RELU,
|
339
|
-
GGML_OP_GELU,
|
340
|
-
GGML_OP_GELU_QUICK,
|
341
|
-
GGML_OP_SILU,
|
342
334
|
GGML_OP_SILU_BACK,
|
343
335
|
GGML_OP_NORM, // normalize
|
344
336
|
GGML_OP_RMS_NORM,
|
@@ -377,6 +369,8 @@ extern "C" {
|
|
377
369
|
GGML_OP_WIN_PART,
|
378
370
|
GGML_OP_WIN_UNPART,
|
379
371
|
|
372
|
+
GGML_OP_UNARY,
|
373
|
+
|
380
374
|
GGML_OP_MAP_UNARY,
|
381
375
|
GGML_OP_MAP_BINARY,
|
382
376
|
|
@@ -390,6 +384,24 @@ extern "C" {
|
|
390
384
|
GGML_OP_COUNT,
|
391
385
|
};
|
392
386
|
|
387
|
+
enum ggml_unary_op {
|
388
|
+
GGML_UNARY_OP_ABS,
|
389
|
+
GGML_UNARY_OP_SGN,
|
390
|
+
GGML_UNARY_OP_NEG,
|
391
|
+
GGML_UNARY_OP_STEP,
|
392
|
+
GGML_UNARY_OP_TANH,
|
393
|
+
GGML_UNARY_OP_ELU,
|
394
|
+
GGML_UNARY_OP_RELU,
|
395
|
+
GGML_UNARY_OP_GELU,
|
396
|
+
GGML_UNARY_OP_GELU_QUICK,
|
397
|
+
GGML_UNARY_OP_SILU,
|
398
|
+
};
|
399
|
+
|
400
|
+
enum ggml_object_type {
|
401
|
+
GGML_OBJECT_TENSOR,
|
402
|
+
GGML_OBJECT_GRAPH,
|
403
|
+
GGML_OBJECT_WORK_BUFFER
|
404
|
+
};
|
393
405
|
|
394
406
|
// ggml object
|
395
407
|
struct ggml_object {
|
@@ -398,7 +410,9 @@ extern "C" {
|
|
398
410
|
|
399
411
|
struct ggml_object * next;
|
400
412
|
|
401
|
-
|
413
|
+
enum ggml_object_type type;
|
414
|
+
|
415
|
+
char padding[4];
|
402
416
|
};
|
403
417
|
|
404
418
|
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
@@ -418,6 +432,9 @@ extern "C" {
|
|
418
432
|
// compute data
|
419
433
|
enum ggml_op op;
|
420
434
|
|
435
|
+
// op params - allocated as int32_t for alignment
|
436
|
+
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
437
|
+
|
421
438
|
bool is_param;
|
422
439
|
|
423
440
|
struct ggml_tensor * grad;
|
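The op_params array gives every tensor 32 bytes of inline, int32_t-aligned storage for operator hyperparameters (for example the new rms_norm epsilon below). A sketch of the byte-copy pattern such storage implies; the helper names here are illustrative, not the library's internal accessors:

    #include <stdint.h>
    #include <string.h>

    #define GGML_MAX_OP_PARAMS 32

    // copy a float in and out of int32_t-aligned storage;
    // memcpy sidesteps type-punning and alignment pitfalls
    static int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];

    static void  set_op_param_f32(float v) { memcpy(op_params, &v, sizeof(v)); }
    static float get_op_param_f32(void)    { float v; memcpy(&v, op_params, sizeof(v)); return v; }

    int main(void) {
        set_op_param_f32(1e-6f);
        return get_op_param_f32() == 1e-6f ? 0 : 1;
    }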
@@ -434,7 +451,7 @@ extern "C" {
|
|
434
451
|
|
435
452
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
436
453
|
|
437
|
-
char padding[
|
454
|
+
char padding[4];
|
438
455
|
};
|
439
456
|
|
440
457
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
@@ -455,6 +472,11 @@ extern "C" {
|
|
455
472
|
void * abort_callback_data;
|
456
473
|
};
|
457
474
|
|
475
|
+
// next prime after GGML_MAX_NODES
|
476
|
+
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
477
|
+
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
478
|
+
#define GGML_GRAPH_HASHTABLE_SIZE 8273
|
479
|
+
|
458
480
|
// computation graph
|
459
481
|
struct ggml_cgraph {
|
460
482
|
int n_nodes;
|
@@ -464,12 +486,16 @@ extern "C" {
|
|
464
486
|
struct ggml_tensor * grads[GGML_MAX_NODES];
|
465
487
|
struct ggml_tensor * leafs[GGML_MAX_NODES];
|
466
488
|
|
489
|
+
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
490
|
+
|
467
491
|
// performance
|
468
492
|
int perf_runs;
|
469
493
|
int64_t perf_cycles;
|
470
494
|
int64_t perf_time_us;
|
471
495
|
};
|
472
496
|
|
497
|
+
static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
|
498
|
+
|
473
499
|
// scratch buffer
|
474
500
|
struct ggml_scratch {
|
475
501
|
size_t offs;
|
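visited_hash_table turns the "was this tensor already added to the graph?" check into an O(1) lookup. The header only fixes the table size (a prime, so linear probing cycles through every slot); what follows is a sketch of the usual open-addressing insert such a visited set implies, not ggml.c's exact code:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    #define GGML_GRAPH_HASHTABLE_SIZE 8273

    // returns true if p was already present; inserts it otherwise.
    // the real table is sized past nodes + leafs, so it never fills up.
    bool hash_insert(void * table[GGML_GRAPH_HASHTABLE_SIZE], void * p) {
        size_t h = (uintptr_t)p % GGML_GRAPH_HASHTABLE_SIZE;
        while (table[h] != NULL) {
            if (table[h] == p) return true;            // already visited
            h = (h + 1) % GGML_GRAPH_HASHTABLE_SIZE;   // linear probing
        }
        table[h] = p;
        return false;
    }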
@@ -531,6 +557,7 @@
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name (enum ggml_op op);
+    GGML_API const char * ggml_op_symbol(enum ggml_op op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -554,6 +581,7 @@
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
     GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
     GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
@@ -613,9 +641,11 @@
     GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
-
-    GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+    GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
 
     //
     // operations on tensors with backpropagation
@@ -625,6 +655,11 @@
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_dup_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_add(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -849,14 +884,17 @@
 
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
-            struct ggml_tensor * a);
+            struct ggml_tensor * a,
+            float eps);
 
     GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
             struct ggml_context * ctx,
-            struct ggml_tensor * a);
+            struct ggml_tensor * a,
+            float eps);
 
     // a - x
     // b - dy
+    // TODO: update with configurable eps
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
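ggml_rms_norm now takes its epsilon explicitly instead of relying on a built-in constant. As a scalar reference for what the operation computes per row, under the standard definition (a sketch, not the library's kernel): each element is divided by the root mean square of the row, with eps keeping the divisor away from zero.

    #include <math.h>

    void rms_norm_ref(const float * x, float * y, int n, float eps) {
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += x[i] * x[i];                    // sum of squares
        }
        const float scale = 1.0f / sqrtf(sum / n + eps);
        for (int i = 0; i < n; ++i) {
            y[i] = x[i] * scale;                   // normalize each element
        }
    }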
@@ -948,11 +986,22 @@
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // a -> b, in-place, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // make contiguous, in-place
+    GGML_API struct ggml_tensor * ggml_cont_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // return view(a), b specifies the new shape
     // TODO: when we start computing gradient, make a copy instead of view
     GGML_API struct ggml_tensor * ggml_reshape(
@@ -1264,6 +1313,16 @@
     typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
     typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
 
+    GGML_API struct ggml_tensor * ggml_unary(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_unary_op op);
+
+    GGML_API struct ggml_tensor * ggml_unary_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_unary_op op);
+
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
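Together with the GGML_OP_UNARY consolidation earlier in the header, ggml_unary is the new entry point for the element-wise ops that used to be standalone GGML_OP_* values. A minimal usage sketch against this header (buffer size and tensor shape are arbitrary choices here):

    #include <assert.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { /*.mem_size   =*/ 16*1024*1024,
                                           /*.mem_buffer =*/ NULL,
                                           /*.no_alloc   =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
        struct ggml_tensor * r = ggml_unary(ctx, t, GGML_UNARY_OP_RELU);

        // the specific unary op is recoverable from the tensor itself
        assert(r->op == GGML_OP_UNARY);
        assert(ggml_get_unary_op(r) == GGML_UNARY_OP_RELU);

        ggml_free(ctx);
        return 0;
    }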
@@ -1343,11 +1402,17 @@
             struct ggml_context * ctx,
             struct ggml_tensor * tensor);
 
+
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
+    // graph allocation in a context
+    GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
+    GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_graph_overhead(void);
+
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
     GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -1666,6 +1666,62 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) + summs;
 
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(3);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    uint32_t ud, um;
+    const uint8_t * restrict db = (const uint8_t *)&ud;
+    const uint8_t * restrict mb = (const uint8_t *)&um;
+
+    float summs = 0;
+
+    // TODO: optimize this
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+        ud = (sc[0] >> 0) & 0x0f0f0f0f;
+        um = (sc[0] >> 4) & 0x0f0f0f0f;
+
+        int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
+        summs += dmin * smin;
+
+        const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
+        const __m128i q2_0 = _mm_and_si128(q2bits, m3);
+        const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+        const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+        const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
+        const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
+        const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
+        const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
+
+        const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+        const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+        const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+        const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
+    }
+
+    *s = hsum_float_8(acc) + summs;
+
 #else
 
     float sumf = 0;
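The shift-and-mask ladder above (srli by 0/2/4/6, then AND with m3) is the SIMD form of pulling four 2-bit quants out of each byte. The same extraction in scalar form, as a self-contained check:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const uint8_t packed = 0xE4;       // binary 11 10 01 00 -> quants 3,2,1,0
        for (int j = 0; j < 4; ++j) {
            const int q = (packed >> (2*j)) & 3;   // shift by 0/2/4/6, mask with 3
            assert(q == j);
        }
        return 0;
    }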
@@ -2295,6 +2351,93 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(3);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    uint64_t aux64;
+
+    uint16_t aux16[2];
+    const int8_t * aux8 = (const int8_t *)aux16;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const uint16_t a = *(const uint16_t *)x[i].scales;
+        aux16[0] = a & 0x0f0f;
+        aux16[1] = (a >> 4) & 0x0f0f;
+
+        const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
+        const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
+        const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
+        const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
+
+        memcpy(&aux64, x[i].hmask, 8);
+
+        __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
+        __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
+        __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
+        __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
+        q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
+        q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
+        q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
+        q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
+
+        // load low 2 bits
+        const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
+
+        // prepare low and high bits
+        const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
+        const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
+        const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
+        const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
+
+        // load Q8 quants
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16,
+        // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+        // and 2 if the high bit was set)
+        const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
+        const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
+        const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
+        const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
+
+        __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
+        __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
+        __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
+        __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
+
+        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+        // multiply with scales
+        p16_0 = _mm_madd_epi16(scale_0, p16_0);
+        p16_1 = _mm_madd_epi16(scale_1, p16_1);
+        p16_2 = _mm_madd_epi16(scale_2, p16_2);
+        p16_3 = _mm_madd_epi16(scale_3, p16_3);
+
+        p16_0 = _mm_add_epi32(p16_0, p16_2);
+        p16_1 = _mm_add_epi32(p16_1, p16_3);
+        __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+
+        // multiply with block scale and accumulate
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
 #else
 
     int8_t aux8[QK_K];
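The subtraction trick described in the comment above rests on a small identity: a q3 weight decodes as (hbit << 2 | low2) - 4, which equals low2 - 4*(1 - hbit). That lets the kernel madd the low bits and the "high bit absent" correction (q3h, pre-shifted left by 2, i.e. times 4) against q8 separately and subtract. A scalar check of the identity:

    #include <assert.h>

    int main(void) {
        for (int low2 = 0; low2 < 4; ++low2) {
            for (int hbit = 0; hbit < 2; ++hbit) {
                const int decoded = ((hbit << 2) | low2) - 4;   // direct decode
                const int split   = low2 - 4*(1 - hbit);        // two-madd form
                assert(decoded == split);
            }
        }
        return 0;
    }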
@@ -2781,6 +2924,60 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) - summs;
 
+#elif defined __AVX__
+
+    const __m128i m4 = _mm_set1_epi8(0xF);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0;
+
+    uint16_t aux16[2];
+    const uint8_t * scales = (const uint8_t *)aux16;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
+        const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
+        const __m256 vd = _mm256_set1_ps(d);
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux16[0] = a[0] & 0x0f0f;
+        aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+        summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
+        const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
+        const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
+        const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
+        const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
+        const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
+        const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
+
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+        const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+        const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+        const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+        const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
+        const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+
+        const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
+        const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+
+    }
+
+    *s = hsum_float_8(acc) - summs;
+
 #else
 
     uint8_t aux8[QK_K];
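The aux16 dance above unpacks two packed 4-bit scales per 16-bit word: AND with 0x0f0f keeps the low nibble of each byte, and shifting right by 4 first exposes the high nibbles. A scalar check of the same unpacking (this assumes a little-endian target, as the byte aliasing in the kernel itself does):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const uint16_t a = 0xAB3C;          // bytes 0x3C, 0xAB
        uint16_t aux16[2];
        aux16[0] = a & 0x0f0f;              // low nibbles:  0x0B0C
        aux16[1] = (a >> 4) & 0x0f0f;       // high nibbles: 0x0A03
        const uint8_t * scales = (const uint8_t *)aux16;
        assert(scales[0] == 0x0C && scales[1] == 0x0B);
        assert(scales[2] == 0x03 && scales[3] == 0x0A);
        return 0;
    }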
@@ -3295,10 +3492,66 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
-#else
+#elif defined __AVX__
 
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i mone = _mm_set1_epi8(1);
 
-    uint8_t aux8[QK_K];
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+        const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
+
+        const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
+        const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
+        const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
+        const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
+
+        int64_t aux64;
+        memcpy(&aux64, x[i].qh, 8);
+        const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
+        const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
+
+        const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
+        const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
+        const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
+        const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
+
+        const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
+        const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
+        const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
+        const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
+
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
+        const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
+        const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
+        const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
+        const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
+        const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
+        const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
+        const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
+
+        const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
+        const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
+
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#else
+
+    int8_t aux8[QK_K];
     int16_t aux16[16];
     float sums [8];
     memset(sums, 0, 8*sizeof(float));
@@ -3308,7 +3561,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const uint8_t * restrict hm = x[i].qh;
         const int8_t * restrict q8 = y[i].qs;
-        uint8_t * restrict a = aux8;
+        int8_t * restrict a = aux8;
         for (int l = 0; l < 32; ++l) {
             a[l+ 0] = q4[l] & 0xF;
             a[l+32] = q4[l] >> 4;
@@ -3858,6 +4111,77 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __AVX__
+
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i m2 = _mm_set1_epi8(3);
+    const __m128i m32s = _mm_set1_epi8(32);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
+        const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
+        const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
+        const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
+        const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
+
+        const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
+        const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
+
+        const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
+        const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
+        const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
+        const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
+
+        const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
+        const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
+        const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
+        const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
+
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
+        __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
+        __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
+        __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
+
+        __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+        __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+        __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+        __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+        p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
+        p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
+        p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
+        p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
+
+        sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+        sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
 #else
 
     int8_t aux8[QK_K];