llama_cpp 0.3.4 → 0.3.5

@@ -199,6 +199,7 @@
  #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_SRC 6
  #define GGML_MAX_NAME 48
+ #define GGML_MAX_OP_PARAMS 32
  #define GGML_DEFAULT_N_THREADS 4


@@ -207,6 +208,7 @@

  #define GGML_UNUSED(x) (void)(x)

+ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

  #define GGML_ASSERT(x) \
  do { \
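
The new GGML_PAD macro rounds x up to the next multiple of n; the bit trick only works when n is a power of two. A minimal standalone check of the arithmetic (not part of the diff):

    #include <stdio.h>

    /* same definition as the macro added above; n must be a power of two */
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main(void) {
        printf("%d\n", GGML_PAD(17, 16)); /* 17 rounded up to a multiple of 16 -> 32 */
        printf("%d\n", GGML_PAD(32, 16)); /* already a multiple of 16 -> stays 32 */
        return 0;
    }
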
@@ -329,16 +331,6 @@ extern "C" {
  GGML_OP_ARGMAX,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
- GGML_OP_ABS,
- GGML_OP_SGN,
- GGML_OP_NEG,
- GGML_OP_STEP,
- GGML_OP_TANH,
- GGML_OP_ELU,
- GGML_OP_RELU,
- GGML_OP_GELU,
- GGML_OP_GELU_QUICK,
- GGML_OP_SILU,
  GGML_OP_SILU_BACK,
  GGML_OP_NORM, // normalize
  GGML_OP_RMS_NORM,
@@ -377,6 +369,8 @@ extern "C" {
  GGML_OP_WIN_PART,
  GGML_OP_WIN_UNPART,

+ GGML_OP_UNARY,
+
  GGML_OP_MAP_UNARY,
  GGML_OP_MAP_BINARY,

@@ -390,6 +384,24 @@ extern "C" {
  GGML_OP_COUNT,
  };

+ enum ggml_unary_op {
+ GGML_UNARY_OP_ABS,
+ GGML_UNARY_OP_SGN,
+ GGML_UNARY_OP_NEG,
+ GGML_UNARY_OP_STEP,
+ GGML_UNARY_OP_TANH,
+ GGML_UNARY_OP_ELU,
+ GGML_UNARY_OP_RELU,
+ GGML_UNARY_OP_GELU,
+ GGML_UNARY_OP_GELU_QUICK,
+ GGML_UNARY_OP_SILU,
+ };
+
+ enum ggml_object_type {
+ GGML_OBJECT_TENSOR,
+ GGML_OBJECT_GRAPH,
+ GGML_OBJECT_WORK_BUFFER
+ };

  // ggml object
  struct ggml_object {
@@ -398,7 +410,9 @@ extern "C" {

  struct ggml_object * next;

- char padding[8];
+ enum ggml_object_type type;
+
+ char padding[4];
  };

  static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
@@ -418,6 +432,9 @@ extern "C" {
  // compute data
  enum ggml_op op;

+ // op params - allocated as int32_t for alignment
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+
  bool is_param;

  struct ggml_tensor * grad;
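
The new op_params block gives every tensor GGML_MAX_OP_PARAMS bytes of inline storage (kept as int32_t slots for alignment) where an operator can stash small scalar arguments instead of routing them through an extra tensor. A hypothetical sketch of how a float parameter, such as the rms_norm eps further down, could be packed and read back; ggml's own internal helpers may differ:

    #include <string.h>
    #include "ggml.h"

    /* hypothetical helpers, not part of the public ggml API */
    static void set_op_param_f32(struct ggml_tensor * t, int slot, float v) {
        memcpy(&t->op_params[slot], &v, sizeof(float)); /* each slot is a 4-byte aligned int32_t */
    }

    static float get_op_param_f32(const struct ggml_tensor * t, int slot) {
        float v;
        memcpy(&v, &t->op_params[slot], sizeof(float));
        return v;
    }
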
@@ -434,7 +451,7 @@ extern "C" {

  void * extra; // extra things e.g. for ggml-cuda.cu

- char padding[8];
+ char padding[4];
  };

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -455,6 +472,11 @@ extern "C" {
  void * abort_callback_data;
  };

+ // next prime after GGML_MAX_NODES
+ // #define GGML_GRAPH_HASHTABLE_SIZE 4099
+ // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
+ #define GGML_GRAPH_HASHTABLE_SIZE 8273
+
  // computation graph
  struct ggml_cgraph {
  int n_nodes;
@@ -464,12 +486,16 @@ extern "C" {
  struct ggml_tensor * grads[GGML_MAX_NODES];
  struct ggml_tensor * leafs[GGML_MAX_NODES];

+ void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
  // performance
  int perf_runs;
  int64_t perf_cycles;
  int64_t perf_time_us;
  };

+ static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
+
  // scratch buffer
  struct ggml_scratch {
  size_t offs;
@@ -531,6 +557,7 @@ extern "C" {

  GGML_API const char * ggml_type_name(enum ggml_type type);
  GGML_API const char * ggml_op_name (enum ggml_op op);
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);

  GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

@@ -554,6 +581,7 @@ extern "C" {
  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

  GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
@@ -613,9 +641,11 @@ extern "C" {
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
- GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
- GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
+ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+ GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
+ GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);

  //
  // operations on tensors with backpropagation
@@ -625,6 +655,11 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_dup_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_add(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -849,14 +884,17 @@ extern "C" {

  GGML_API struct ggml_tensor * ggml_rms_norm(
  struct ggml_context * ctx,
- struct ggml_tensor * a);
+ struct ggml_tensor * a,
+ float eps);

  GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
  struct ggml_context * ctx,
- struct ggml_tensor * a);
+ struct ggml_tensor * a,
+ float eps);

  // a - x
  // b - dy
+ // TODO: update with configurable eps
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
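
ggml_rms_norm and ggml_rms_norm_inplace now take the epsilon explicitly instead of relying on a constant baked into ggml, so every call site gains one argument. A minimal call-site sketch, assuming ctx and cur come from the surrounding model code and 1e-6f is only an illustrative value:

    #include "ggml.h"

    /* updated call with the new eps parameter (value is illustrative) */
    static struct ggml_tensor * rms_norm_example(struct ggml_context * ctx,
                                                 struct ggml_tensor  * cur) {
        return ggml_rms_norm(ctx, cur, 1e-6f);
    }
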
@@ -948,11 +986,22 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ // a -> b, in-place, return view(b)
+ GGML_API struct ggml_tensor * ggml_cpy_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // make contiguous
  GGML_API struct ggml_tensor * ggml_cont(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // make contiguous, in-place
+ GGML_API struct ggml_tensor * ggml_cont_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // return view(a), b specifies the new shape
  // TODO: when we start computing gradient, make a copy instead of view
  GGML_API struct ggml_tensor * ggml_reshape(
@@ -1264,6 +1313,16 @@ extern "C" {
  typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
  typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);

+ GGML_API struct ggml_tensor * ggml_unary(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_unary_op op);
+
+ GGML_API struct ggml_tensor * ggml_unary_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_unary_op op);
+
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
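
With the per-activation ops folded into GGML_OP_UNARY, ggml_unary and ggml_unary_inplace select the activation through enum ggml_unary_op, and ggml_get_unary_op reads the choice back from a result tensor. A minimal sketch, assuming ctx and x are supplied by the caller:

    #include "ggml.h"

    static struct ggml_tensor * relu_example(struct ggml_context * ctx,
                                             struct ggml_tensor  * x) {
        struct ggml_tensor * y = ggml_unary(ctx, x, GGML_UNARY_OP_RELU);
        /* the selected activation can be queried back from the tensor: */
        /* ggml_get_unary_op(y) == GGML_UNARY_OP_RELU */
        return y;
    }
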
@@ -1343,11 +1402,17 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * tensor);

+
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

+ // graph allocation in a context
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
+ GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_graph_overhead(void);
+
  // ggml_graph_plan() has to be called before ggml_graph_compute()
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
  GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
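
ggml_new_graph and ggml_build_forward_ctx place the ggml_cgraph inside a ggml_context (presumably tracked via the new GGML_OBJECT_GRAPH object type) instead of on the caller's stack, and ggml_graph_overhead() reports how much context memory that costs. A minimal sketch, assuming out is the final tensor of an already-built computation:

    #include "ggml.h"

    static struct ggml_cgraph * build_graph_example(struct ggml_context * ctx,
                                                    struct ggml_tensor  * out) {
        struct ggml_cgraph * gf = ggml_new_graph(ctx); /* allocated inside ctx */
        ggml_build_forward_expand(gf, out);
        /* equivalent shortcut: gf = ggml_build_forward_ctx(ctx, out); */
        return gf;
    }

When sizing the context, the extra ggml_graph_overhead() bytes have to be part of the mem_size budget alongside the tensors themselves.
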
@@ -1666,6 +1666,62 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  *s = hsum_float_8(acc) + summs;

+ #elif defined __AVX__
+
+ const __m128i m3 = _mm_set1_epi8(3);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ uint32_t ud, um;
+ const uint8_t * restrict db = (const uint8_t *)&ud;
+ const uint8_t * restrict mb = (const uint8_t *)&um;
+
+ float summs = 0;
+
+ // TODO: optimize this
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+ const uint8_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+ ud = (sc[0] >> 0) & 0x0f0f0f0f;
+ um = (sc[0] >> 4) & 0x0f0f0f0f;
+
+ int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
+ summs += dmin * smin;
+
+ const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
+ const __m128i q2_0 = _mm_and_si128(q2bits, m3);
+ const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+ const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+ const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
+ const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
+ const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
+ const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
+
+ const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+ const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+ const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+ const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
+ }
+
+ *s = hsum_float_8(acc) + summs;
+
  #else

  float sumf = 0;
@@ -2295,6 +2351,93 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  *s = hsum_float_8(acc);

+ #elif defined __AVX__
+
+ const __m128i m3 = _mm_set1_epi8(3);
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ uint64_t aux64;
+
+ uint16_t aux16[2];
+ const int8_t * aux8 = (const int8_t *)aux16;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+ const uint8_t * restrict q3 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const uint16_t a = *(const uint16_t *)x[i].scales;
+ aux16[0] = a & 0x0f0f;
+ aux16[1] = (a >> 4) & 0x0f0f;
+
+ const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
+ const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
+ const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
+ const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
+
+ memcpy(&aux64, x[i].hmask, 8);
+
+ __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
+ __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
+ __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
+ __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
+ q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
+ q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
+ q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
+ q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
+
+ // load low 2 bits
+ const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
+
+ // prepare low and high bits
+ const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
+ const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
+ const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
+ const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
+
+ // load Q8 quants
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16,
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+ // and 2 if the high bit was set)
+ const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
+ const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
+ const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
+ const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
+
+ __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
+ __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
+ __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
+ __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
+
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+ // multiply with scales
+ p16_0 = _mm_madd_epi16(scale_0, p16_0);
+ p16_1 = _mm_madd_epi16(scale_1, p16_1);
+ p16_2 = _mm_madd_epi16(scale_2, p16_2);
+ p16_3 = _mm_madd_epi16(scale_3, p16_3);
+
+ p16_0 = _mm_add_epi32(p16_0, p16_2);
+ p16_1 = _mm_add_epi32(p16_1, p16_3);
+ __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+
+ // multiply with block scale and accumulate
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
+
+ }
+
+ *s = hsum_float_8(acc);
+
  #else

  int8_t aux8[QK_K];
@@ -2781,6 +2924,60 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  *s = hsum_float_8(acc) - summs;

+ #elif defined __AVX__
+
+ const __m128i m4 = _mm_set1_epi8(0xF);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ float summs = 0;
+
+ uint16_t aux16[2];
+ const uint8_t * scales = (const uint8_t *)aux16;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
+ const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
+ const __m256 vd = _mm256_set1_ps(d);
+
+ const uint16_t * a = (const uint16_t *)x[i].scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+ summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+
+ const uint8_t * restrict q4 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
+ const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
+ const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
+ const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
+ const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
+ const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
+ const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+ const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+ const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+ const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+ const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
+ const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+
+ const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
+ const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+
+ }
+
+ *s = hsum_float_8(acc) - summs;
+
  #else

  uint8_t aux8[QK_K];
@@ -3295,10 +3492,66 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

  *s = hsum_float_8(acc);

- #else
+ #elif defined __AVX__

+ const __m128i m4 = _mm_set1_epi8(0xF);
+ const __m128i mone = _mm_set1_epi8(1);

- uint8_t aux8[QK_K];
+ __m256 acc = _mm256_setzero_ps();
+
+ for (int i = 0; i < nb; ++i) {
+
+ const uint8_t * restrict q5 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+ const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
+
+ const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
+ const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
+ const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
+ const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
+
+ int64_t aux64;
+ memcpy(&aux64, x[i].qh, 8);
+ const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
+ const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
+
+ const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
+ const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
+ const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
+ const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
+
+ const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
+ const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
+ const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
+ const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
+ const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
+ const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
+ const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
+ const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
+ const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
+ const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
+ const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
+
+ const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
+ const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
+
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+
+ }
+
+ *s = hsum_float_8(acc);
+
+ #else
+
+ int8_t aux8[QK_K];
  int16_t aux16[16];
  float sums [8];
  memset(sums, 0, 8*sizeof(float));
@@ -3308,7 +3561,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  const uint8_t * restrict q4 = x[i].qs;
  const uint8_t * restrict hm = x[i].qh;
  const int8_t * restrict q8 = y[i].qs;
- uint8_t * restrict a = aux8;
+ int8_t * restrict a = aux8;
  for (int l = 0; l < 32; ++l) {
  a[l+ 0] = q4[l] & 0xF;
  a[l+32] = q4[l] >> 4;
@@ -3858,6 +4111,77 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  *s = hsum_float_8(acc);

+ #elif defined __AVX__
+
+ const __m128i m4 = _mm_set1_epi8(0xF);
+ const __m128i m2 = _mm_set1_epi8(3);
+ const __m128i m32s = _mm_set1_epi8(32);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+ const uint8_t * restrict q4 = x[i].ql;
+ const uint8_t * restrict qh = x[i].qh;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
+ const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
+ const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
+ const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
+
+ __m128i sumi_0 = _mm_setzero_si128();
+ __m128i sumi_1 = _mm_setzero_si128();
+
+ const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
+ const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
+
+ const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
+ const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
+
+ const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
+ const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
+ const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
+ const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
+
+ const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
+ const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
+ const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
+ const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
+ __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
+ __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
+ __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
+
+ __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+ __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+ __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+ __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+ p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
+ p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
+ p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
+ p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
+
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+ }
+
+ *s = hsum_float_8(acc);
+
  #else

  int8_t aux8[QK_K];