llama_cpp 0.3.4 → 0.3.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +293 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +304 -99
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +201 -71
- data/ext/llama_cpp/src/ggml-metal.metal +68 -54
- data/ext/llama_cpp/src/ggml.c +713 -978
- data/ext/llama_cpp/src/ggml.h +82 -17
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/llama.cpp +524 -121
- data/ext/llama_cpp/src/llama.h +60 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -199,6 +199,7 @@
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 48
+#define GGML_MAX_OP_PARAMS 32
 #define GGML_DEFAULT_N_THREADS 4
 
 
@@ -207,6 +208,7 @@
 
 #define GGML_UNUSED(x) (void)(x)
 
+#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
 #define GGML_ASSERT(x) \
     do { \
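The new GGML_PAD(x, n) macro rounds x up to the next multiple of n; the mask trick assumes n is a power of two. A quick illustration of how it expands (values chosen only for the example):

    GGML_PAD(17, 16)   // (17 + 15) & ~15  ->  32
    GGML_PAD(32, 16)   // (32 + 15) & ~15  ->  32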
@@ -329,16 +331,6 @@ extern "C" {
         GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
-        GGML_OP_ABS,
-        GGML_OP_SGN,
-        GGML_OP_NEG,
-        GGML_OP_STEP,
-        GGML_OP_TANH,
-        GGML_OP_ELU,
-        GGML_OP_RELU,
-        GGML_OP_GELU,
-        GGML_OP_GELU_QUICK,
-        GGML_OP_SILU,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
@@ -377,6 +369,8 @@ extern "C" {
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
 
+        GGML_OP_UNARY,
+
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
 
@@ -390,6 +384,24 @@ extern "C" {
         GGML_OP_COUNT,
     };
 
+    enum ggml_unary_op {
+        GGML_UNARY_OP_ABS,
+        GGML_UNARY_OP_SGN,
+        GGML_UNARY_OP_NEG,
+        GGML_UNARY_OP_STEP,
+        GGML_UNARY_OP_TANH,
+        GGML_UNARY_OP_ELU,
+        GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_GELU,
+        GGML_UNARY_OP_GELU_QUICK,
+        GGML_UNARY_OP_SILU,
+    };
+
+    enum ggml_object_type {
+        GGML_OBJECT_TENSOR,
+        GGML_OBJECT_GRAPH,
+        GGML_OBJECT_WORK_BUFFER
+    };
 
     // ggml object
     struct ggml_object {
@@ -398,7 +410,9 @@ extern "C" {
 
         struct ggml_object * next;
 
-        char padding[8];
+        enum ggml_object_type type;
+
+        char padding[4];
     };
 
     static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
@@ -418,6 +432,9 @@ extern "C" {
         // compute data
        enum ggml_op op;
 
+        // op params - allocated as int32_t for alignment
+        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+
         bool is_param;
 
         struct ggml_tensor * grad;
@@ -434,7 +451,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[
+        char padding[4];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -455,6 +472,11 @@ extern "C" {
         void * abort_callback_data;
     };
 
+    // next prime after GGML_MAX_NODES
+    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
+    // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
+    #define GGML_GRAPH_HASHTABLE_SIZE 8273
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -464,12 +486,16 @@ extern "C" {
         struct ggml_tensor * grads[GGML_MAX_NODES];
         struct ggml_tensor * leafs[GGML_MAX_NODES];
 
+        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
         // performance
         int perf_runs;
         int64_t perf_cycles;
         int64_t perf_time_us;
     };
 
+    static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
+
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
@@ -531,6 +557,7 @@
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name (enum ggml_op op);
+    GGML_API const char * ggml_op_symbol(enum ggml_op op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -554,6 +581,7 @@
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
     GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
     GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
@@ -613,9 +641,11 @@
     GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API
-
-    GGML_API
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+    GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
 
     //
     // operations on tensors with backpropagation
@@ -625,6 +655,11 @@
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_dup_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_add(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -849,14 +884,17 @@
 
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
-            struct ggml_tensor * a);
+            struct ggml_tensor * a,
+            float eps);
 
     GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
             struct ggml_context * ctx,
-            struct ggml_tensor * a);
+            struct ggml_tensor * a,
+            float eps);
 
     // a - x
     // b - dy
+    // TODO: update with configurable eps
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
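With this change the RMS-norm epsilon is passed by the caller instead of being fixed inside ggml. A minimal sketch of an updated call site; ctx0, inpL and the 1e-6f value are assumed names/values for the example, not taken from the diff:

    // old: cur = ggml_rms_norm(ctx0, inpL);
    struct ggml_tensor * cur = ggml_rms_norm(ctx0, inpL, 1e-6f);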
@@ -948,11 +986,22 @@
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // a -> b, in-place, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // make contiguous, in-place
+    GGML_API struct ggml_tensor * ggml_cont_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // return view(a), b specifies the new shape
     // TODO: when we start computing gradient, make a copy instead of view
     GGML_API struct ggml_tensor * ggml_reshape(
@@ -1264,6 +1313,16 @@
     typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
     typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
 
+    GGML_API struct ggml_tensor * ggml_unary(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_unary_op op);
+
+    GGML_API struct ggml_tensor * ggml_unary_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_unary_op op);
+
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
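The activation ops dropped from enum ggml_op earlier in this header are now expressed through GGML_OP_UNARY plus an enum ggml_unary_op recorded on the tensor (readable back via ggml_get_unary_op). A minimal sketch of the new entry points, based only on the declarations above; ctx and t are assumed to be an existing context and tensor:

    // GELU through the unified unary op
    struct ggml_tensor * y = ggml_unary(ctx, t, GGML_UNARY_OP_GELU);

    // in-place variant, returns view(t)
    struct ggml_tensor * z = ggml_unary_inplace(ctx, t, GGML_UNARY_OP_RELU);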
@@ -1343,11 +1402,17 @@
             struct ggml_context * ctx,
             struct ggml_tensor * tensor);
 
+
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
+    // graph allocation in a context
+    GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
+    GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_graph_overhead(void);
+
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
     GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
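The new graph-allocation functions let the ggml_cgraph live inside a ggml_context rather than on the stack, with ggml_graph_overhead() reporting how much extra memory to reserve for it. A minimal sketch of the intended usage built only from the declarations above; compute_buf_size and out are assumed names for the example:

    struct ggml_init_params params = {
        /*.mem_size   =*/ compute_buf_size + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);   // graph object allocated in ctx
    ggml_build_forward_expand(gf, out);              // or: gf = ggml_build_forward_ctx(ctx, out);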
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -1666,6 +1666,62 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) + summs;
 
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(3);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    uint32_t ud, um;
+    const uint8_t * restrict db = (const uint8_t *)&ud;
+    const uint8_t * restrict mb = (const uint8_t *)&um;
+
+    float summs = 0;
+
+    // TODO: optimize this
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+        ud = (sc[0] >> 0) & 0x0f0f0f0f;
+        um = (sc[0] >> 4) & 0x0f0f0f0f;
+
+        int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
+        summs += dmin * smin;
+
+        const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
+        const __m128i q2_0 = _mm_and_si128(q2bits, m3);
+        const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+        const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+        const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
+        const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
+        const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
+        const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
+
+        const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+        const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+        const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+        const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
+    }
+
+    *s = hsum_float_8(acc) + summs;
+
 #else
 
     float sumf = 0;
@@ -2295,6 +2351,93 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(3);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    uint64_t aux64;
+
+    uint16_t aux16[2];
+    const int8_t * aux8 = (const int8_t *)aux16;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const uint16_t a = *(const uint16_t *)x[i].scales;
+        aux16[0] = a & 0x0f0f;
+        aux16[1] = (a >> 4) & 0x0f0f;
+
+        const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
+        const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
+        const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
+        const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
+
+        memcpy(&aux64, x[i].hmask, 8);
+
+        __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
+        __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
+        __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
+        __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
+        q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
+        q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
+        q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
+        q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
+
+        // load low 2 bits
+        const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
+
+        // prepare low and high bits
+        const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
+        const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
+        const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
+        const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
+
+        // load Q8 quants
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16,
+        // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+        // and 2 if the high bit was set)
+        const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
+        const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
+        const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
+        const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
+
+        __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
+        __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
+        __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
+        __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
+
+        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+        // multiply with scales
+        p16_0 = _mm_madd_epi16(scale_0, p16_0);
+        p16_1 = _mm_madd_epi16(scale_1, p16_1);
+        p16_2 = _mm_madd_epi16(scale_2, p16_2);
+        p16_3 = _mm_madd_epi16(scale_3, p16_3);
+
+        p16_0 = _mm_add_epi32(p16_0, p16_2);
+        p16_1 = _mm_add_epi32(p16_1, p16_3);
+        __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+
+        // multiply with block scale and accumulate
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
 #else
 
     int8_t aux8[QK_K];
@@ -2781,6 +2924,60 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) - summs;
 
+#elif defined __AVX__
+
+    const __m128i m4 = _mm_set1_epi8(0xF);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0;
+
+    uint16_t aux16[2];
+    const uint8_t * scales = (const uint8_t *)aux16;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
+        const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
+        const __m256 vd = _mm256_set1_ps(d);
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux16[0] = a[0] & 0x0f0f;
+        aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+        summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
+        const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
+        const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
+        const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
+        const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
+        const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
+        const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
+
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+        const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+        const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+        const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+        const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
+        const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+
+        const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
+        const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+
+    }
+
+    *s = hsum_float_8(acc) - summs;
+
 #else
 
     uint8_t aux8[QK_K];
@@ -3295,10 +3492,66 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
-#else
+#elif defined __AVX__
 
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i mone = _mm_set1_epi8(1);
 
-
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+        const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
+
+        const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
+        const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
+        const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
+        const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
+
+        int64_t aux64;
+        memcpy(&aux64, x[i].qh, 8);
+        const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
+        const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
+
+        const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
+        const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
+        const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
+        const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
+
+        const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
+        const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
+        const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
+        const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
+
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
+        const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
+        const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
+        const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
+        const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
+        const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
+        const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
+        const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
+
+        const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
+        const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
+
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#else
+
+    int8_t aux8[QK_K];
     int16_t aux16[16];
     float sums [8];
     memset(sums, 0, 8*sizeof(float));
@@ -3308,7 +3561,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const uint8_t * restrict hm = x[i].qh;
         const int8_t * restrict q8 = y[i].qs;
-
+        int8_t * restrict a = aux8;
         for (int l = 0; l < 32; ++l) {
             a[l+ 0] = q4[l] & 0xF;
             a[l+32] = q4[l] >> 4;
@@ -3858,6 +4111,77 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __AVX__
+
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i m2 = _mm_set1_epi8(3);
+    const __m128i m32s = _mm_set1_epi8(32);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
+        const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
+        const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
+        const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
+        const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
+
+        const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
+        const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
+
+        const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
+        const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
+        const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
+        const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
+
+        const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
+        const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
+        const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
+        const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
+
+        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+        __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
+        __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
+        __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
+        __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
+
+        __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+        __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+        __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+        __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+        p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
+        p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
+        p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
+        p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
+
+        sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+        sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
 #else
 
     int8_t aux8[QK_K];