llama_cpp 0.3.4 → 0.3.5

This diff shows the content of the publicly released package versions as published to their public registry, and is provided for informational purposes only. The changes are in the bundled ggml sources: the header gains new graph, op-parameter, and unary-op APIs, and the k-quant dot-product kernels gain dedicated __AVX__ (non-AVX2) paths.
@@ -199,6 +199,7 @@
  #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_SRC 6
  #define GGML_MAX_NAME 48
+ #define GGML_MAX_OP_PARAMS 32
  #define GGML_DEFAULT_N_THREADS 4


@@ -207,6 +208,7 @@

  #define GGML_UNUSED(x) (void)(x)

+ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

  #define GGML_ASSERT(x) \
  do { \
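As context (not part of the upstream diff): the new GGML_PAD macro rounds x up to the next multiple of n, with n expected to be a power of two. A quick check of the arithmetic, using the macro exactly as defined above:

    /* Illustration only. */
    _Static_assert(GGML_PAD(13, 8)  == 16, "rounds up to the next multiple of 8");
    _Static_assert(GGML_PAD(32, 16) == 32, "already-aligned values are unchanged");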
@@ -329,16 +331,6 @@ extern "C" {
  GGML_OP_ARGMAX,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
- GGML_OP_ABS,
- GGML_OP_SGN,
- GGML_OP_NEG,
- GGML_OP_STEP,
- GGML_OP_TANH,
- GGML_OP_ELU,
- GGML_OP_RELU,
- GGML_OP_GELU,
- GGML_OP_GELU_QUICK,
- GGML_OP_SILU,
  GGML_OP_SILU_BACK,
  GGML_OP_NORM, // normalize
  GGML_OP_RMS_NORM,
@@ -377,6 +369,8 @@ extern "C" {
  GGML_OP_WIN_PART,
  GGML_OP_WIN_UNPART,

+ GGML_OP_UNARY,
+
  GGML_OP_MAP_UNARY,
  GGML_OP_MAP_BINARY,

@@ -390,6 +384,24 @@ extern "C" {
  GGML_OP_COUNT,
  };

+ enum ggml_unary_op {
+ GGML_UNARY_OP_ABS,
+ GGML_UNARY_OP_SGN,
+ GGML_UNARY_OP_NEG,
+ GGML_UNARY_OP_STEP,
+ GGML_UNARY_OP_TANH,
+ GGML_UNARY_OP_ELU,
+ GGML_UNARY_OP_RELU,
+ GGML_UNARY_OP_GELU,
+ GGML_UNARY_OP_GELU_QUICK,
+ GGML_UNARY_OP_SILU,
+ };
+
+ enum ggml_object_type {
+ GGML_OBJECT_TENSOR,
+ GGML_OBJECT_GRAPH,
+ GGML_OBJECT_WORK_BUFFER
+ };

  // ggml object
  struct ggml_object {
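As context (not part of the upstream diff): the element-wise activations removed from enum ggml_op above are now represented by the single GGML_OP_UNARY op, with the concrete function carried per tensor as a ggml_unary_op (readable through ggml_get_unary_op(), declared later in this header). A sketch of how code that inspects graph nodes might adapt:

    /* 0.3.4-era check (enum values now removed): */
    // if (t->op == GGML_OP_RELU) { ... }

    /* 0.3.5: the op is generic, the activation kind lives on the tensor. */
    if (t->op == GGML_OP_UNARY && ggml_get_unary_op(t) == GGML_UNARY_OP_RELU) {
        /* handle ReLU nodes */
    }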
@@ -398,7 +410,9 @@ extern "C" {

  struct ggml_object * next;

- char padding[8];
+ enum ggml_object_type type;
+
+ char padding[4];
  };

  static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
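A layout note (an observation, not from the diff): with 8-byte pointers, the 4-byte enum plus the shrunken padding[4] occupy the same 8 bytes the old padding[8] did, so GGML_OBJECT_SIZE should come out unchanged. A hypothetical check, assuming a 4-byte enum representation and an LP64 target:

    /* Hypothetical, illustration only. */
    _Static_assert(sizeof(enum ggml_object_type) == 4 && sizeof(struct ggml_object) == 32,
                   "type + padding[4] replaces the old padding[8] without changing the size");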
@@ -418,6 +432,9 @@ extern "C" {
  // compute data
  enum ggml_op op;

+ // op params - allocated as int32_t for alignment
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+
  bool is_param;

  struct ggml_tensor * grad;
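As context (not part of the upstream diff): op_params gives every tensor a small fixed area in which its op can record parameters (an epsilon, an axis, and so on) instead of hanging an extra tensor off the node. A minimal sketch of the idea; set_op_params here is a hypothetical helper, not the package's internal API:

    #include <assert.h>
    #include <string.h>

    /* Hypothetical helper: copy op parameters into the tensor's op_params area. */
    static void set_op_params(struct ggml_tensor * t, const void * params, size_t size) {
        assert(size <= GGML_MAX_OP_PARAMS);
        memcpy(t->op_params, params, size);
    }

    /* e.g. an RMS-norm node could record its eps this way: */
    // const float eps = 1e-6f;
    // set_op_params(result, &eps, sizeof(eps));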
@@ -434,7 +451,7 @@ extern "C" {

  void * extra; // extra things e.g. for ggml-cuda.cu

- char padding[8];
+ char padding[4];
  };

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -455,6 +472,11 @@ extern "C" {
  void * abort_callback_data;
  };

+ // next prime after GGML_MAX_NODES
+ // #define GGML_GRAPH_HASHTABLE_SIZE 4099
+ // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
+ #define GGML_GRAPH_HASHTABLE_SIZE 8273
+
  // computation graph
  struct ggml_cgraph {
  int n_nodes;
@@ -464,12 +486,16 @@ extern "C" {
  struct ggml_tensor * grads[GGML_MAX_NODES];
  struct ggml_tensor * leafs[GGML_MAX_NODES];

+ void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
  // performance
  int perf_runs;
  int64_t perf_cycles;
  int64_t perf_time_us;
  };

+ static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
+
  // scratch buffer
  struct ggml_scratch {
  size_t offs;
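Worth noting (an observation, not from the diff): because the visited hash table is embedded directly in struct ggml_cgraph, every graph object now carries GGML_GRAPH_HASHTABLE_SIZE pointer slots on top of the node and leaf arrays; the new GGML_GRAPH_SIZE constant (and ggml_graph_overhead(), added further down) exposes the total.

    /* Illustration only, assuming 8-byte pointers:
     * the embedded table alone is 8273 * 8 = 66184 bytes per graph. */
    size_t hash_table_bytes = (size_t) GGML_GRAPH_HASHTABLE_SIZE * sizeof(void *);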
@@ -531,6 +557,7 @@ extern "C" {

  GGML_API const char * ggml_type_name(enum ggml_type type);
  GGML_API const char * ggml_op_name (enum ggml_op op);
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);

  GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

@@ -554,6 +581,7 @@ extern "C" {
  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

  GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
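A usage sketch (not from the package): the new ggml_get_no_alloc() pairs with the existing setter, so the no_alloc flag can be toggled and restored without tracking it separately. Assuming a valid ggml_context * ctx:

    /* Temporarily skip data allocation while building tensors whose buffers
     * are managed elsewhere, then restore the previous setting. */
    const bool was_no_alloc = ggml_get_no_alloc(ctx);
    ggml_set_no_alloc(ctx, true);
    /* ... create tensors / build the graph ... */
    ggml_set_no_alloc(ctx, was_no_alloc);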
@@ -613,9 +641,11 @@ extern "C" {
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
- GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
- GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
+ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+ GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
+ GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);

  //
  // operations on tensors with backpropagation
@@ -625,6 +655,11 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_dup_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_add(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -849,14 +884,17 @@ extern "C" {

  GGML_API struct ggml_tensor * ggml_rms_norm(
  struct ggml_context * ctx,
- struct ggml_tensor * a);
+ struct ggml_tensor * a,
+ float eps);

  GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
  struct ggml_context * ctx,
- struct ggml_tensor * a);
+ struct ggml_tensor * a,
+ float eps);

  // a - x
  // b - dy
+ // TODO: update with configurable eps
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
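As context (not part of the upstream diff): ggml_rms_norm() and ggml_rms_norm_inplace() now take the epsilon explicitly instead of relying on a value fixed inside ggml, so every call site gains one argument. A sketch of the update; 1e-6f is purely an illustrative value:

    /* 0.3.4-era call: */
    // cur = ggml_rms_norm(ctx, cur);

    /* 0.3.5: pass eps explicitly. */
    cur = ggml_rms_norm(ctx, cur, 1e-6f);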
@@ -948,11 +986,22 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ // a -> b, in-place, return view(b)
+ GGML_API struct ggml_tensor * ggml_cpy_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // make contiguous
  GGML_API struct ggml_tensor * ggml_cont(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // make contiguous, in-place
+ GGML_API struct ggml_tensor * ggml_cont_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // return view(a), b specifies the new shape
  // TODO: when we start computing gradient, make a copy instead of view
  GGML_API struct ggml_tensor * ggml_reshape(
@@ -1264,6 +1313,16 @@ extern "C" {
  typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
  typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);

+ GGML_API struct ggml_tensor * ggml_unary(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_unary_op op);
+
+ GGML_API struct ggml_tensor * ggml_unary_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_unary_op op);
+
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
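A usage sketch (not from the package): ggml_unary() and ggml_unary_inplace() are the generic entry points for the activations enumerated in ggml_unary_op above. Assuming a context ctx and an input tensor a:

    /* Build a GGML_OP_UNARY node applying GELU to a. */
    struct ggml_tensor * g = ggml_unary(ctx, a, GGML_UNARY_OP_GELU);

    /* In-place variant, reusing a's buffer. */
    struct ggml_tensor * r = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);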
@@ -1343,11 +1402,17 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * tensor);

+
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

+ // graph allocation in a context
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
+ GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_graph_overhead(void);
+
  // ggml_graph_plan() has to be called before ggml_graph_compute()
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
  GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
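A usage sketch (not from the package): ggml_build_forward() still returns the (now much larger) ggml_cgraph by value, while the new ggml_new_graph() / ggml_build_forward_ctx() allocate the graph inside a ggml_context; ggml_graph_overhead() reports how much context memory to budget for that. Assuming a context ctx and an output tensor out:

    /* Allocate the graph in the context rather than on the stack; size the
     * context's memory pool with ggml_graph_overhead() in mind. */
    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, out);

    struct ggml_cplan plan = ggml_graph_plan(gf, GGML_DEFAULT_N_THREADS);
    /* if (plan.work_size > 0) { allocate plan.work_data before computing } */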
@@ -1666,6 +1666,62 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

  *s = hsum_float_8(acc) + summs;

+ #elif defined __AVX__
+
+ const __m128i m3 = _mm_set1_epi8(3);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ uint32_t ud, um;
+ const uint8_t * restrict db = (const uint8_t *)&ud;
+ const uint8_t * restrict mb = (const uint8_t *)&um;
+
+ float summs = 0;
+
+ // TODO: optimize this
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+ const uint8_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+ ud = (sc[0] >> 0) & 0x0f0f0f0f;
+ um = (sc[0] >> 4) & 0x0f0f0f0f;
+
+ int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
+ summs += dmin * smin;
+
+ const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
+ const __m128i q2_0 = _mm_and_si128(q2bits, m3);
+ const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+ const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+ const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
+ const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
+ const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
+ const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
+
+ const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+ const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+ const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+ const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
+ }
+
+ *s = hsum_float_8(acc) + summs;
+
  #else

  float sumf = 0;
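As context (not part of the upstream diff): the __AVX__ path added here splits the q2_K dot product into a 2-bit-quant term (accumulated into acc) and a minimums term (summs, fed by the precomputed bsums). A scalar sketch of the per-sub-block decomposition, illustration only; q2 is assumed to hold one already-unpacked 2-bit value per byte:

    /* Not the package code: each q2_K sub-block has a 4-bit scale and a 4-bit
     * min, and a weight dequantizes to d*scale*q - dmin*min, so the dot
     * product against q8 splits into a quant term and a block-sum term. */
    static float q2_subblock_dot(float d, float dmin, int scale, int min,
                                 const uint8_t * q2, const int8_t * q8, int n) {
        int sum_qy = 0, sum_y = 0;
        for (int j = 0; j < n; ++j) {
            sum_qy += q2[j] * q8[j];  /* 2-bit quant times q8 value    */
            sum_y  += q8[j];          /* matches the precomputed bsums */
        }
        return d * scale * sum_qy - dmin * min * sum_y;
    }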
@@ -2295,6 +2351,93 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

  *s = hsum_float_8(acc);

+ #elif defined __AVX__
+
+ const __m128i m3 = _mm_set1_epi8(3);
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ uint64_t aux64;
+
+ uint16_t aux16[2];
+ const int8_t * aux8 = (const int8_t *)aux16;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+ const uint8_t * restrict q3 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const uint16_t a = *(const uint16_t *)x[i].scales;
+ aux16[0] = a & 0x0f0f;
+ aux16[1] = (a >> 4) & 0x0f0f;
+
+ const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
+ const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
+ const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
+ const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
+
+ memcpy(&aux64, x[i].hmask, 8);
+
+ __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
+ __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
+ __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
+ __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
+ q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
+ q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
+ q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
+ q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
+
+ // load low 2 bits
+ const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
+
+ // prepare low and high bits
+ const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
+ const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
+ const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
+ const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
+
+ // load Q8 quants
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16,
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+ // and 2 if the high bit was set)
+ const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
+ const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
+ const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
+ const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
+
+ __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
+ __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
+ __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
+ __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
+
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+ // multiply with scales
+ p16_0 = _mm_madd_epi16(scale_0, p16_0);
+ p16_1 = _mm_madd_epi16(scale_1, p16_1);
+ p16_2 = _mm_madd_epi16(scale_2, p16_2);
+ p16_3 = _mm_madd_epi16(scale_3, p16_3);
+
+ p16_0 = _mm_add_epi32(p16_0, p16_2);
+ p16_1 = _mm_add_epi32(p16_1, p16_3);
+ __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+
+ // multiply with block scale and accumulate
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
+
+ }
+
+ *s = hsum_float_8(acc);
+
  #else

  int8_t aux8[QK_K];
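As context (not part of the upstream diff): the comment inside the block above describes the central trick; because _mm_maddubs_epi16 needs an unsigned first operand, the low 2 bits and the high-bit correction are multiplied against q8 separately and then subtracted (the p16_* and q8s_* values). A scalar sketch of the same arithmetic, illustration only, with quants assumed already unpacked one per byte:

    /* Not the package code: a q3_K value decodes to low2 - 4 when its high
     * bit is clear and to low2 when it is set, so
     *   sum(q * y) = sum(low2 * y) - 4 * sum(y over lanes with a clear high bit). */
    static int q3_dot(const uint8_t * low2, const uint8_t * hbit,
                      const int8_t * q8, int n) {
        int sum = 0;
        for (int j = 0; j < n; ++j) {
            const int q = (int) low2[j] - (hbit[j] ? 0 : 4);
            sum += q * q8[j];
        }
        return sum;
    }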
@@ -2781,6 +2924,60 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

  *s = hsum_float_8(acc) - summs;

+ #elif defined __AVX__
+
+ const __m128i m4 = _mm_set1_epi8(0xF);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ float summs = 0;
+
+ uint16_t aux16[2];
+ const uint8_t * scales = (const uint8_t *)aux16;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
+ const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
+ const __m256 vd = _mm256_set1_ps(d);
+
+ const uint16_t * a = (const uint16_t *)x[i].scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+ summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+
+ const uint8_t * restrict q4 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
+ const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
+ const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
+ const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
+ const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
+ const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
+ const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+ const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+ const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+ const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+ const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
+ const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+
+ const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
+ const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+
+ }
+
+ *s = hsum_float_8(acc) - summs;
+
  #else

  uint8_t aux8[QK_K];
@@ -3295,10 +3492,66 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

  *s = hsum_float_8(acc);

- #else
+ #elif defined __AVX__

+ const __m128i m4 = _mm_set1_epi8(0xF);
+ const __m128i mone = _mm_set1_epi8(1);

- uint8_t aux8[QK_K];
+ __m256 acc = _mm256_setzero_ps();
+
+ for (int i = 0; i < nb; ++i) {
+
+ const uint8_t * restrict q5 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+ const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
+
+ const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
+ const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
+ const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
+ const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
+
+ int64_t aux64;
+ memcpy(&aux64, x[i].qh, 8);
+ const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
+ const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
+
+ const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
+ const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
+ const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
+ const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
+
+ const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
+ const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
+ const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
+ const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
+ const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
+ const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
+ const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
+ const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
+ const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
+ const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
+ const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
+
+ const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
+ const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
+
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+
+ }
+
+ *s = hsum_float_8(acc);
+
+ #else
+
+ int8_t aux8[QK_K];
  int16_t aux16[16];
  float sums [8];
  memset(sums, 0, 8*sizeof(float));
@@ -3308,7 +3561,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  const uint8_t * restrict q4 = x[i].qs;
  const uint8_t * restrict hm = x[i].qh;
  const int8_t * restrict q8 = y[i].qs;
- uint8_t * restrict a = aux8;
+ int8_t * restrict a = aux8;
  for (int l = 0; l < 32; ++l) {
  a[l+ 0] = q4[l] & 0xF;
  a[l+32] = q4[l] >> 4;
@@ -3858,6 +4111,77 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

  *s = hsum_float_8(acc);

+ #elif defined __AVX__
+
+ const __m128i m4 = _mm_set1_epi8(0xF);
+ const __m128i m2 = _mm_set1_epi8(3);
+ const __m128i m32s = _mm_set1_epi8(32);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+ const uint8_t * restrict q4 = x[i].ql;
+ const uint8_t * restrict qh = x[i].qh;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
+ const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
+ const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
+ const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
+
+ __m128i sumi_0 = _mm_setzero_si128();
+ __m128i sumi_1 = _mm_setzero_si128();
+
+ const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
+ const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
+
+ const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
+ const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
+
+ const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
+ const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
+ const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
+ const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
+
+ const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
+ const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
+ const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
+ const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
+ __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
+ __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
+ __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
+
+ __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+ __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+ __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+ __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+ p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
+ p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
+ p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
+ p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
+
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+ }
+
+ *s = hsum_float_8(acc);
+
  #else

  int8_t aux8[QK_K];