llama_cpp 0.12.7 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,6 +51,7 @@
 
 #define UNUSED GGML_UNUSED
 
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
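The comment and macro in this hunk exist because `_mm256_set_m128i` is missing from some older compilers (gcc 7 is the cited example). As a standalone sketch of the equivalence (the helper name here is illustrative, not part of the package):

    #include <immintrin.h>

    // Equivalent to _mm256_set_m128i(hi, lo) where that intrinsic exists:
    // widen the low 128-bit half, then insert the high half into lane 1.
    static inline __m256i mm256_set_m128i_compat(__m128i hi, __m128i lo) {
        return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
    }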
@@ -462,6 +463,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
     return res;
 }
 
+// NOTE: not tested
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
 #else
 
 #define ggml_int16x8x2_t int16x8x2_t
@@ -476,6 +501,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
 #define ggml_vld1q_s8_x2 vld1q_s8_x2
 #define ggml_vld1q_s8_x4 vld1q_s8_x4
 #define ggml_vqtbl1q_s8 vqtbl1q_s8
+#define ggml_vqtbl1q_u8 vqtbl1q_u8
 
 #endif
 
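One caveat worth noting on the scalar `ggml_vqtbl1q_u8` fallback above (which the diff itself flags as untested): the hardware NEON `vqtbl1q_u8` zeroes any lane whose index byte is 16 or larger, while the plain `a[b[i]]` form assumes every index is in range, as is true for the masked selectors this file passes in. A NEON-faithful scalar line would read, as a sketch:

    res[i] = (b[i] < 16) ? a[b[i]] : 0;  // vqtbl1q_u8 semantics for arbitrary indices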
@@ -1852,7 +1878,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
     float mins[QK_K/16];
     float scales[QK_K/16];
     float sw[QK_K/16];
-    float weight[QK_K/16];
+    float weight[16];
     uint8_t Ls[QK_K/16], Lm[QK_K/16];
 
     for (int i = 0; i < nb; i++) {
@@ -1862,13 +1888,42 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
         float sigma2 = sumx2/QK_K;
         for (int j = 0; j < QK_K/16; ++j) {
             const float * restrict qw = quant_weights + QK_K * i + 16*j;
-            for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
+            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
             for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
-            scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
         }
 
-        float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
-        float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
+        float dm, mm;
+#if QK_K == 64
+        float max_scale = 0, max_min = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            max_scale = MAX(max_scale, scales[j]);
+            max_min = MAX(max_min, mins[j]);
+        }
+        dm = max_scale/15;
+        mm = max_min/15;
+        if (max_scale) {
+            float id = 1/dm;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(id*scales[j]);
+                Ls[j] = MAX(0, MIN(15, l));
+            }
+        } else {
+            memset(Ls, 0, QK_K/16);
+        }
+        if (max_min) {
+            float id = 1/mm;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(id*mins[j]);
+                Lm[j] = MAX(0, MIN(15, l));
+            }
+        } else {
+            memset(Lm, 0, QK_K/16);
+        }
+#else
+        dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
+        mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
+#endif
         y[i].d = GGML_FP32_TO_FP16(dm);
         y[i].dmin = GGML_FP32_TO_FP16(mm);
         dm = GGML_FP16_TO_FP32(y[i].d);
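The `weight[16]` change in the previous hunk and the hard-coded `16` bounds here fix a latent bug that only bites when QK_K is 64: each sub-block always holds 16 values, but QK_K/16 then evaluates to 64/16 = 4, so the old code sized the array for, filled, and quantized only 4 of the 16 weights. With the default QK_K = 256, QK_K/16 = 16 and the old and new code are identical.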
@@ -3470,6 +3525,265 @@ static const uint64_t iq2xs_grid[512] = {
     0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
 };
 
+static const uint64_t iq2s_grid[1024] = {
+    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
+    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
+    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
+    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
+    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
+    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
+    0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
+    0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
+    0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
+    0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
+    0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
+    0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
+    0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
+    0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
+    0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
+    0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
+    0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
+    0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
+    0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
+    0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
+    0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
+    0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
+    0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
+    0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
+    0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
+    0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
+    0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
+    0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
+    0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
+    0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
+    0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
+    0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
+    0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
+    0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
+    0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
+    0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
+    0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
+    0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
+    0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
+    0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
+    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
+    0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
+    0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
+    0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
+    0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
+    0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
+    0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
+    0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
+    0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
+    0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
+    0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
+    0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
+    0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
+    0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
+    0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
+    0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
+    0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
+    0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
+    0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
+    0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
+    0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
+    0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
+    0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
+    0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
+    0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
+    0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
+    0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
+    0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
+    0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
+    0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
+    0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
+    0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
+    0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
+    0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
+    0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
+    0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
+    0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
+    0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
+    0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
+    0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
+    0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
+    0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
+    0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
+    0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
+    0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
+    0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
+    0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
+    0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
+    0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
+    0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
+    0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
+    0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
+    0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
+    0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
+    0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
+    0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
+    0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
+    0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
+    0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
+    0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
+    0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
+    0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
+    0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
+    0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
+    0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
+    0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
+    0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
+    0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
+    0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
+    0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
+    0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
+    0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
+    0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
+    0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
+    0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
+    0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
+    0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
+    0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
+    0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
+    0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
+    0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
+    0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
+    0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
+    0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
+    0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
+    0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
+    0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
+    0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
+    0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
+    0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
+    0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
+    0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
+    0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
+    0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
+    0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
+    0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
+    0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
+    0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
+    0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
+    0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
+    0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
+    0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
+    0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
+    0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
+    0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
+    0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
+    0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
+    0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
+    0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
+    0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
+    0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
+    0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
+    0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
+    0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
+    0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
+    0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
+    0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
+    0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
+    0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
+    0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
+    0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
+    0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
+    0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
+    0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
+    0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
+    0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
+    0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
+    0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
+    0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
+    0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
+    0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
+    0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
+    0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
+    0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
+    0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
+    0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
+    0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
+    0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
+    0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
+    0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
+    0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
+    0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
+    0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
+    0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
+    0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
+    0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
+    0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
+    0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
+    0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
+    0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
+    0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
+    0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
+    0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
+    0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
+    0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
+    0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
+    0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
+    0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
+    0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
+    0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
+    0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
+    0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
+    0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
+    0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
+    0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
+    0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
+    0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
+    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
+    0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
+    0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
+    0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
+    0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
+    0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
+    0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
+    0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
+    0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
+    0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
+    0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
+    0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
+    0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
+    0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
+    0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
+    0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
+    0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
+    0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
+    0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
+    0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
+    0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
+    0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
+    0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
+    0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
+    0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
+    0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
+    0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
+    0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
+    0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
+    0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
+    0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
+    0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
+    0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
+    0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
+    0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
+    0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
+    0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
+    0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
+    0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
+    0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
+    0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
+    0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
+    0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
+    0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
+    0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
+    0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
+    0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
+    0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
+    0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
+};
+
 static const uint32_t iq3xxs_grid[256] = {
     0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
     0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
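The new `iq2s_grid` codebook follows the same convention as `iq2xs_grid` above it: each `uint64_t` entry packs eight unsigned byte magnitudes, and every byte is one of 0x08, 0x19, or 0x2b. The dequantizers below read an entry through a byte pointer and apply the per-group scale and sign bits separately; a minimal sketch (hypothetical helper, assuming `kmask_iq2xs` is the usual {1, 2, 4, ..., 128} bit-mask table so `signs >> j & 1` is equivalent):

    static inline void iq2s_decode_entry(int idx, float dl, uint8_t sign_bits, float out[8]) {
        const uint8_t * grid = (const uint8_t *)(iq2s_grid + idx); // idx in [0, 1024)
        for (int j = 0; j < 8; ++j) {
            out[j] = dl * grid[j] * (((sign_bits >> j) & 1) ? -1.f : 1.f);
        }
    }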
@@ -3505,6 +3819,73 @@ static const uint32_t iq3xxs_grid[256] = {
     0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };
 
+static const uint32_t iq3s_grid[512] = {
+    0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
+    0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
+    0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
+    0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
+    0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
+    0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
+    0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
+    0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
+    0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
+    0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
+    0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
+    0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
+    0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
+    0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
+    0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
+    0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
+    0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
+    0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
+    0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
+    0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
+    0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
+    0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
+    0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
+    0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
+    0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
+    0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
+    0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
+    0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
+    0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
+    0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
+    0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
+    0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
+    0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
+    0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
+    0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
+    0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
+    0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
+    0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
+    0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
+    0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
+    0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
+    0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
+    0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
+    0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
+    0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
+    0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
+    0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
+    0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
+    0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
+    0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
+    0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
+    0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
+    0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
+    0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
+    0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
+    0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
+    0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
+    0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
+    0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
+    0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
+    0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
+    0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
+    0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
+    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
+};
+
 #define NGRID_IQ2XXS 512
 static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
     0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
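`iq3s_grid` is the 3-bit analogue: each `uint32_t` entry packs four odd byte magnitudes in 1..15, and an entry is addressed by an 8-bit index byte from `qs` extended with one high bit from `qh`. A sketch of the lookup used by `dequantize_row_iq3_s` further down (hypothetical helper name):

    static inline const uint8_t * iq3s_lookup(uint8_t qs_byte, int qh_bit) {
        return (const uint8_t *)(iq3s_grid + (qs_byte | (qh_bit << 8))); // 9-bit index, < 512
    }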
@@ -3704,6 +4085,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
     }
 }
 
+// ====================== 2.5625 bpw (de)-quantization
+
+void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    float db[2];
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint8_t * signs = qs + QK_K/8;
+
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
+            db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
+            for (int l = 0; l < 4; ++l) {
+                const float dl = db[l/2];
+                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+            qs += 4;
+            signs += 4;
+        }
+    }
+}
+
 // ====================== 3.0625 bpw (de)-quantization
 
 void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
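The 2.5625 bpw figure in the new comment checks out against the pointers used above, per 256-value superblock: 32 index bytes (`qs`), 32 sign bytes (at `qs + QK_K/8`), 8 high-bit bytes (`qh`), 8 scale bytes, and a 16-bit `d` make 82 bytes, and 82 × 8 / 256 = 2.5625 bits per weight. Each codebook index is 10 bits wide: `qs[l] | (qh[ib32] << (8-2*l) & 0x300)` lifts two bits of `qh` into index bits 8..9; as a worked example, for l = 1 the shift is 6, so `qh` bits 2..3 become the two high index bits.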
@@ -3736,6 +4149,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
     }
 }
 
+// ====================== 3.3125 bpw (de)-quantization
+
+void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint8_t * signs = x[i].signs;
+
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
+            const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
+                    y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+            qs += 8;
+            signs += 4;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
+                    y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+            qh += 2;
+            qs += 8;
+            signs += 4;
+        }
+    }
+}
+
 // ====================== 1.5625 bpw (de)-quantization
 
 void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
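The index construction in `dequantize_row_iq3_s` is the 9-bit counterpart of the iq2_s one: for the even byte `qs[2*l]` the shift `8-2*l` places `qh` bit `2*l` at index bit 8, and for the odd byte `qs[2*l+1]` the shift `7-2*l` places bit `2*l+1` there, so `qs` byte m always borrows `qh` bit m. Worked example, l = 2: `(qh[0] << 4) & 256` selects bit 4 of `qh[0]` as the ninth index bit for `qs[4]`.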
@@ -3799,6 +4255,33 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
     }
 }
 
+void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
+    assert(k % QK_K == 0);
+#if QK_K == 64
+    dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
+#else
+    const int nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+
+        const uint8_t * qs = x[i].qs;
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const int ls = ((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4);
+            const float dl = d * (ls - 32);
+            for (int j = 0; j < 16; ++j) {
+                y[j+ 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
+                y[j+16] = dl * kvalues_iq4nl[qs[j] >> 4];
+            }
+            y += 32;
+            qs += 16;
+        }
+    }
+#endif
+}
+
 //===================================== Q8_K ==============================================
 
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
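For `iq4_xs`, each 32-value sub-block gets a 6-bit scale split across two fields, reassembled and re-centered by the `ls` expression above. A standalone sketch (hypothetical helper; assumes `scales_h` carries 2 bits per sub-block, i.e. 16 bits for the 8 sub-blocks of a 256-value superblock):

    static inline int iq4_xs_scale(const uint8_t * scales_l, uint16_t scales_h, int ib) {
        const int lo = (scales_l[ib/2] >> 4*(ib%2)) & 0xf; // 4 low bits, two per byte
        const int hi = (scales_h >> 2*ib) & 3;             // 2 high bits
        return (lo | (hi << 4)) - 32;                      // signed range -32..31
    }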
@@ -5857,7 +6340,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     float sumf = 0;
 
-    int isum[4];
+    int isum[QK_K/16];
 
     for (int i = 0; i < nb; ++i) {
 
@@ -5873,14 +6356,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        isum[0] = isum[1] = isum[2] = isum[3] = 0;
+        memset(isum, 0, (QK_K/16)*sizeof(int));
         for (int l = 0; l < 16; ++l) {
             isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
             isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
             isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
             isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
         }
-        for (int l = 0; l < 4; ++l) {
+        for (int l = 0; l < QK_K/16; ++l) {
             isum[l] *= (sc[l] & 0xF);
         }
         sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
@@ -8806,6 +9289,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
 #endif
 
+#if defined (__AVX2__) || defined (__ARM_NEON)
 static const int8_t keven_signs_q2xs[1024] = {
     1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
     1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
@@ -8840,6 +9324,7 @@ static const int8_t keven_signs_q2xs[1024] = {
     1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
     1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
 };
+#endif
 
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -9037,15 +9522,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
 #elif defined(__AVX2__)
 
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-    const __m256i m511 = _mm256_set1_epi16(511);
     const __m256i mone = _mm256_set1_epi8(1);
-
-    static const uint8_t k_bit_helper[32] = {
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-    };
     static const char block_sign_shuffle_mask_1[32] = {
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
         0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
@@ -9059,11 +9536,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
     };
 
-    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
     const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
     const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
     const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
 
+#if QK_K == 64
+    static const uint8_t k_bit_helper[16] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
+    const __m128i m511 = _mm_set1_epi16(511);
+    typedef union {
+        __m128i vec_index;
+        uint16_t index[8];
+    } index_t;
+
+    index_t idx;
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
+        idx.vec_index = _mm_and_si128(q2_data, m511);
+
+        const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
+        const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
+        const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
+
+        const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
+        const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
+        const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
+
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
+
+        const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
+                                               iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
+        const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
+                                               iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
+
+        __m256i signs;
+        signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
+        signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+        const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
+
+        signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
+        signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+        const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
+
+        const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
+        const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
+
+        const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
+        const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
+
+        const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+#else
+
+    static const uint8_t k_bit_helper[32] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
+    const __m256i m511 = _mm256_set1_epi16(511);
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
     uint64_t aux64;
 
     // somewhat hacky, but gives a significant boost in performance
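Both AVX2 paths above lean on the same trick: an iq2_xs element is 16 bits wide, a 9-bit grid index (hence the 511 mask) plus 7 explicit sign bits, with the eighth sign implied so that each group of 8 signs has even parity. `k_bit_helper` is a 16-entry `pshufb` lookup whose entry n is 0x80 exactly when n has odd popcount, which reconstructs the implied bit. A scalar sketch of the same reconstruction (illustrative only, assuming `signs7` holds the 7 stored bits):

    static inline uint8_t iq2xs_full_signs(uint8_t signs7) {
        uint8_t p = signs7 ^ (signs7 >> 4);
        p ^= p >> 2;
        p ^= p >> 1;                      // p & 1 = parity of the stored bits
        return signs7 | ((p & 1) << 7);   // implied 8th sign keeps parity even
    }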
@@ -9111,8 +9654,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
         const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
         const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
-        const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
-        const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
+        const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
+        const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
 
         __m256i signs;
         signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
@@ -9152,6 +9695,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     }
 
     *s = 0.125f * hsum_float_8(accumf);
+#endif
 
 #else
 
@@ -9193,7 +9737,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 #endif
 }
 
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -9201,88 +9745,148 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     UNUSED(by);
     UNUSED(bs);
 
-    const block_iq3_xxs * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_iq2_s * restrict x = vx;
+    const block_q8_K * restrict y = vy;
 
     const int nb = n / QK_K;
 
 #if defined(__ARM_NEON)
 
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
 
-    uint32_t aux32[2];
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
 
-    ggml_int8x16x4_t q3s;
+    const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
+    const uint8x16_t mask2 = vld1q_u8(k_mask2);
+    const uint8x16_t m1 = vdupq_n_u8(1);
+    const int32x4_t vzero = vdupq_n_s32(0);
+
+    uint8x16x2_t vs;
+    ggml_int8x16x4_t q2s;
     ggml_int8x16x4_t q8b;
 
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
+
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t * restrict q8 = y[i].qs;
-        float sumf1 = 0, sumf2 = 0;
+
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t * restrict q8 = y[i].qs;
+
+        int sumi1 = 0, sumi2 = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
-            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
-            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
-            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
-            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
-            q3 += 16;
-            q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
-            q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
-            q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
-            q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
-            q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
-            q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
-            q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
-            q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
-            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
-            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
+            q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
+            q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
+            q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
+            q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
+            qs += 8;
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vceqq_u8(vs.val[0], mask2);
+            vs.val[1] = vceqq_u8(vs.val[1], mask2);
+
+            q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
+            q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vceqq_u8(vs.val[0], mask2);
+            vs.val[1] = vceqq_u8(vs.val[1], mask2);
+
+            signs += 4;
+
+            q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
+            q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
+
+            const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
+            const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
+            const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
+            const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
+
+            sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
+            sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
+            sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
+            sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
        }
-        sumf += d*(sumf1 + sumf2);
+        sumf += d*(sumi1 + sumi2);
    }
-    *s = 0.5f * sumf;
+
+    *s = 0.125f * sumf;
 
 #elif defined(__AVX2__)
 
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
 
-    uint32_t aux32[2];
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
+    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
+
+    uint64_t aux64;
 
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
         const int8_t * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+        const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
+
         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
             const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
-                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            q3 += 8;
-            const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
-                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            q3 += 8;
-            memcpy(aux32, gas, 8); gas += 8;
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
-                                                   signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
-            const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const uint16_t ls1 = aux32[0] >> 28;
-            const uint16_t ls2 = aux32[1] >> 28;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+            const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+                                                   iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
+                                                   iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+                                                   iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+            const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+                                                   iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
+                                                   iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+                                                   iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+            qs += 8;
+
+            __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
+
+            aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
+            const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
+
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
             sumi1 = _mm256_add_epi32(sumi1, p1);
             sumi2 = _mm256_add_epi32(sumi2, p2);
         }
@@ -9291,19 +9895,163 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9291
9895
 
9292
9896
  }
9293
9897
 
9294
- *s = 0.25f * hsum_float_8(accumf);
9898
+ *s = 0.125f * hsum_float_8(accumf);
9295
9899
 
9296
9900
  #else
9297
9901
 
9298
- uint32_t aux32;
9902
+ float sumf = 0;
9903
+ for (int i = 0; i < nb; i++) {
9299
9904
 
9300
- float sumf = 0.f;
9301
- for (int i = 0; i < nb; ++i) {
9302
9905
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9303
- const uint8_t * restrict q3 = x[i].qs;
9304
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9305
- const int8_t * restrict q8 = y[i].qs;
9306
- int32_t bsum = 0;
9906
+ const int8_t * q8 = y[i].qs;
9907
+ const uint8_t * qs = x[i].qs;
9908
+ const uint8_t * qh = x[i].qh;
9909
+ const uint8_t * signs = qs + QK_K/8;
9910
+
9911
+ int bsum = 0;
9912
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9913
+ int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
9914
+ int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
9915
+ int sumi1 = 0, sumi2 = 0;
9916
+ for (int l = 0; l < 2; ++l) {
9917
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
9918
+ for (int j = 0; j < 8; ++j) {
9919
+ sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
9920
+ }
9921
+ q8 += 8;
9922
+ }
9923
+ for (int l = 2; l < 4; ++l) {
9924
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
9925
+ for (int j = 0; j < 8; ++j) {
9926
+ sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
9927
+ }
9928
+ q8 += 8;
9929
+ }
9930
+ bsum += ls1 * sumi1 + ls2 * sumi2;
9931
+ qs += 4;
9932
+ signs += 4;
9933
+ }
9934
+
9935
+ sumf += d * bsum;
9936
+ }
9937
+
9938
+ *s = 0.125f * sumf;
9939
+
9940
+ #endif
9941
+
9942
+ }
9943
+
9944
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9945
+ assert(n % QK_K == 0);
9946
+ assert(nrc == 1);
9947
+ UNUSED(nrc);
9948
+ UNUSED(bx);
9949
+ UNUSED(by);
9950
+ UNUSED(bs);
9951
+
9952
+ const block_iq3_xxs * restrict x = vx;
9953
+ const block_q8_K * restrict y = vy;
9954
+
9955
+ const int nb = n / QK_K;
9956
+
9957
+ #if defined(__ARM_NEON)
9958
+
9959
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9960
+
9961
+ uint32_t aux32[2];
9962
+
9963
+ ggml_int8x16x4_t q3s;
9964
+ ggml_int8x16x4_t q8b;
9965
+
9966
+ float sumf = 0;
9967
+ for (int i = 0; i < nb; ++i) {
9968
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9969
+ const uint8_t * restrict q3 = x[i].qs;
9970
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
9971
+ const int8_t * restrict q8 = y[i].qs;
9972
+ float sumf1 = 0, sumf2 = 0;
9973
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9974
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9975
+ memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
9976
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
9977
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
9978
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
9979
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
9980
+ q3 += 16;
9981
+ q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
9982
+ q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
9983
+ q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
9984
+ q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
9985
+ q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
9986
+ q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
9987
+ q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
9988
+ q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
9989
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
9990
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
9991
+ sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
9992
+ sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
9993
+ }
9994
+ sumf += d*(sumf1 + sumf2);
9995
+ }
9996
+ *s = 0.5f * sumf;
9997
+
9998
+ #elif defined(__AVX2__)
9999
+
10000
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10001
+
10002
+ uint32_t aux32[2];
10003
+
10004
+ __m256 accumf = _mm256_setzero_ps();
10005
+ for (int i = 0; i < nb; ++i) {
10006
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10007
+ const uint8_t * restrict q3 = x[i].qs;
10008
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
10009
+ const int8_t * restrict q8 = y[i].qs;
10010
+ __m256i sumi1 = _mm256_setzero_si256();
10011
+ __m256i sumi2 = _mm256_setzero_si256();
10012
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10013
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10014
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10015
+ const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
10016
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
10017
+ q3 += 8;
10018
+ const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
10019
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
10020
+ q3 += 8;
10021
+ memcpy(aux32, gas, 8); gas += 8;
10022
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
10023
+ signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
10024
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
10025
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
10026
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
10027
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
10028
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
10029
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
10030
+ const uint16_t ls1 = aux32[0] >> 28;
10031
+ const uint16_t ls2 = aux32[1] >> 28;
10032
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
10033
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
10034
+ sumi1 = _mm256_add_epi32(sumi1, p1);
10035
+ sumi2 = _mm256_add_epi32(sumi2, p2);
10036
+ }
10037
+
10038
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
10039
+
10040
+ }
10041
+
10042
+ *s = 0.25f * hsum_float_8(accumf);
10043
+
10044
+ #else
10045
+
10046
+ uint32_t aux32;
10047
+
10048
+ float sumf = 0.f;
10049
+ for (int i = 0; i < nb; ++i) {
10050
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10051
+ const uint8_t * restrict q3 = x[i].qs;
10052
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
10053
+ const int8_t * restrict q8 = y[i].qs;
10054
+ int32_t bsum = 0;
9307
10055
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9308
10056
  memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
9309
10057
  const uint32_t ls = 2*(aux32 >> 28) + 1;
@@ -9327,6 +10075,245 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9327
10075
  #endif
9328
10076
  }
9329
10077
 
10078
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10079
+ assert(n % QK_K == 0);
10080
+ assert(nrc == 1);
10081
+ UNUSED(nrc);
10082
+ UNUSED(bx);
10083
+ UNUSED(by);
10084
+ UNUSED(bs);
10085
+
10086
+ const block_iq3_s * restrict x = vx;
10087
+ const block_q8_K * restrict y = vy;
10088
+
10089
+ const int nb = n / QK_K;
10090
+
10091
+ #if defined(__ARM_NEON)
10092
+
10093
+ typedef union {
10094
+ uint16x8_t vec_index;
10095
+ uint16_t index[8];
10096
+ } vec_index_t;
10097
+
10098
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10099
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10100
+ };
10101
+
10102
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
10103
+
10104
+ static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
10105
+
10106
+ const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
10107
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
10108
+ const int16x8_t hshift = vld1q_s16(k_shift);
10109
+ const uint16x8_t m256 = vdupq_n_u16(256);
10110
+ const uint8x16_t m1 = vdupq_n_u8(1);
10111
+
10112
+ uint8x16x2_t vs;
10113
+ ggml_int8x16x4_t q3s;
10114
+ ggml_int8x16x4_t q8b;
10115
+ vec_index_t idx;
10116
+
10117
+ #if QK_K == 256
10118
+ uint32_t scales32[2];
10119
+ const uint8_t * scales8 = (const uint8_t *)scales32;
10120
+ #endif
10121
+
10122
+ float sumf = 0;
10123
+ for (int i = 0; i < nb; ++i) {
10124
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10125
+ const uint8_t * restrict qs = x[i].qs;
10126
+ const uint8_t * restrict qh = x[i].qh;
10127
+ const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10128
+ const int8_t * restrict q8 = y[i].qs;
10129
+
10130
+ #if QK_K == 256
10131
+ memcpy(scales32, x[i].scales, 4);
10132
+ scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
10133
+ scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
10134
+ #endif
10135
+
10136
+ int sumi1 = 0, sumi2 = 0;
10137
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10138
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
10139
+
10140
+ const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
10141
+ idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
10142
+ const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
10143
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
10144
+ const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
10145
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
10146
+ idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
10147
+ const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
10148
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
10149
+ const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
10150
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
10151
+
10152
+
10153
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
10154
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10155
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10156
+ vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
10157
+ vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
10158
+
10159
+ q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
10160
+ q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
10161
+
10162
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
10163
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10164
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10165
+ vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
10166
+ vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
10167
+
10168
+ signs += 4;
10169
+
10170
+ q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
10171
+ q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
10172
+
10173
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
10174
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
10175
+ #if QK_K == 256
10176
+ sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
10177
+ sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
10178
+ #else
10179
+ sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
10180
+ sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
10181
+ #endif
10182
+ }
10183
+ sumf += d*(sumi1 + sumi2);
10184
+ }
10185
+ *s = sumf;
10186
+
10187
+ #elif defined(__AVX2__)
10188
+
10189
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10190
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10191
+ };
10192
+
10193
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
10194
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
10195
+ };
10196
+
10197
+ const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
10198
+ const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
10199
+
10200
+ const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
10201
+ const __m256i idx_mask = _mm256_set1_epi32(256);
10202
+
10203
+ typedef union {
10204
+ __m256i vec[2];
10205
+ uint32_t index[16];
10206
+ } index_t;
10207
+
10208
+ index_t idx;
10209
+
10210
+ __m256 accumf = _mm256_setzero_ps();
10211
+ for (int i = 0; i < nb; ++i) {
10212
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10213
+ const uint8_t * restrict qs = x[i].qs;
10214
+ const uint8_t * restrict qh = x[i].qh;
10215
+ const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10216
+ const int8_t * restrict q8 = y[i].qs;
10217
+ __m256i sumi1 = _mm256_setzero_si256();
10218
+ __m256i sumi2 = _mm256_setzero_si256();
10219
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10220
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10221
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10222
+ const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
10223
+ idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
10224
+ idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
10225
+ idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
10226
+ idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
10227
+ idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
10228
+ idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
10229
+
10230
+ // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
10231
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
10232
+ //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
10233
+ const __m256i q2_1 = _mm256_set_epi32(
10234
+ iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
10235
+ iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
10236
+ );
10237
+ const __m256i q2_2 = _mm256_set_epi32(
10238
+ iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
10239
+ iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
10240
+ );
10241
+
10242
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
10243
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
10244
+ const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
10245
+ const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
10246
+
10247
+ aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
10248
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
10249
+ const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
10250
+ const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
10251
+
10252
+ signs += 4;
10253
+
10254
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
10255
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
10256
+ const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
10257
+ const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
10258
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
10259
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
10260
+ sumi1 = _mm256_add_epi32(sumi1, p1);
10261
+ sumi2 = _mm256_add_epi32(sumi2, p2);
10262
+ }
10263
+
10264
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
10265
+
10266
+ }
10267
+
10268
+ *s = hsum_float_8(accumf);
10269
+
10270
+ #else
10271
+
10272
+ float sumf = 0.f;
10273
+ for (int i = 0; i < nb; ++i) {
10274
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10275
+ const uint8_t * restrict qs = x[i].qs;
10276
+ const uint8_t * restrict qh = x[i].qh;
10277
+ const uint8_t * restrict signs = x[i].signs;
10278
+ const int8_t * restrict q8 = y[i].qs;
10279
+ int32_t bsum = 0;
10280
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10281
+ const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
10282
+ const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
10283
+ int32_t sumi = 0;
10284
+ for (int l = 0; l < 4; ++l) {
10285
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
10286
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
10287
+ for (int j = 0; j < 4; ++j) {
10288
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10289
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
10290
+ }
10291
+ q8 += 8;
10292
+ }
10293
+ qs += 8;
10294
+ signs += 4;
10295
+ bsum += sumi * ls1;
10296
+ sumi = 0;
10297
+ for (int l = 0; l < 4; ++l) {
10298
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
10299
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
10300
+ for (int j = 0; j < 4; ++j) {
10301
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10302
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
10303
+ }
10304
+ q8 += 8;
10305
+ }
10306
+ qs += 8;
10307
+ signs += 4;
10308
+ bsum += sumi * ls2;
10309
+ }
10310
+ sumf += d * bsum;
10311
+ }
10312
+ *s = sumf;
10313
+ #endif
10314
+ }
10315
+
10316
+
9330
10317
  #ifdef __AVX2__
9331
10318
  static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
9332
10319
  const __m256i ax = _mm256_sign_epi8(x, x);
@@ -9348,7 +10335,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
9348
10335
 
9349
10336
  const int nb = n / QK_K;
9350
10337
 
9351
- #if defined __ARM_NEON
10338
+ // TODO: implement for QK_K = 64
10339
+ #if defined __ARM_NEON && QK_K == 256
9352
10340
 
9353
10341
  const uint8x16_t m8 = vdupq_n_u8(0x08);
9354
10342
  const uint8x16_t m7 = vdupq_n_u8(0x07);
@@ -9405,7 +10393,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
9405
10393
 
9406
10394
  *s = sumf;
9407
10395
 
9408
- #elif defined __AVX2__
10396
+ // TODO: implement for QK_K = 64
10397
+ #elif defined __AVX2__ && QK_K == 256
9409
10398
 
9410
10399
  const __m128i m8 = _mm_set1_epi8(0x08);
9411
10400
  const __m128i m7 = _mm_set1_epi8(0x07);
@@ -9420,8 +10409,12 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
9420
10409
 
9421
10410
  uint64_t aux64;
9422
10411
 
9423
- __m256i v_gindex;
9424
- const uint16_t * gindex = (const uint16_t *)&v_gindex;
10412
+ typedef union m256i_uint16 {
10413
+ __m256i reg;
10414
+ uint16_t s[16];
10415
+ } m256i_uint16_t;
10416
+
10417
+ m256i_uint16_t v_gindex;
9425
10418
 
9426
10419
  __m256 accum = _mm256_setzero_ps();
9427
10420
  for (int i = 0; i < nb; ++i) {
@@ -9436,13 +10429,13 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
9436
10429
  memcpy(&aux64, sc, 8); sc += 8;
9437
10430
  const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
9438
10431
  const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
9439
- v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
10432
+ v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
9440
10433
  const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
9441
10434
 
9442
10435
  for (int i32 = 0; i32 < 4; ++i32) {
9443
10436
  const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
9444
- const __m256i q1b = _mm256_set_epi64x(iq1s_grid[gindex[4*i32+3]], iq1s_grid[gindex[4*i32+2]],
9445
- iq1s_grid[gindex[4*i32+1]], iq1s_grid[gindex[4*i32+0]]);
10437
+ const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
10438
+ iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
9446
10439
  const __m256i dot = mul_add_epi8(q1b, q8b);
9447
10440
  const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
9448
10441
  const __m256i p = _mm256_madd_epi16(s16, dot);
@@ -9520,27 +10513,134 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
9520
10513
  int8x16x4_t q8b;
9521
10514
  int32x4_t prod_1, prod_2;
9522
10515
 
9523
- float sumf = 0;
10516
+ float sumf = 0;
10517
+
10518
+ for (int ib = 0; ib < nb; ib += 2) {
10519
+
10520
+ q4bits.val[0] = vld1q_u8(x[ib+0].qs);
10521
+ q4bits.val[1] = vld1q_u8(x[ib+1].qs);
10522
+ q8b.val[0] = vld1q_s8(y[ib+0].qs);
10523
+ q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
10524
+ q8b.val[2] = vld1q_s8(y[ib+1].qs);
10525
+ q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);
10526
+
10527
+ q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
10528
+ q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
10529
+ q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
10530
+ q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
10531
+
10532
+ prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
10533
+ prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
10534
+
10535
+ sumf +=
10536
+ GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
10537
+ GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
10538
+ }
10539
+
10540
+ *s = sumf;
10541
+
10542
+ #elif defined __AVX2__
10543
+
10544
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
10545
+ const __m128i m4b = _mm_set1_epi8(0x0f);
10546
+ const __m256i mone = _mm256_set1_epi16(1);
10547
+
10548
+ __m256 accum1 = _mm256_setzero_ps();
10549
+ __m256 accum2 = _mm256_setzero_ps();
10550
+ for (int ib = 0; ib < nb; ib += 2) {
10551
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
10552
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
10553
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
10554
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
10555
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10556
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10557
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10558
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10559
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10560
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10561
+ const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
10562
+ const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
10563
+ accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
10564
+ _mm256_cvtepi32_ps(p_1), accum1);
10565
+ accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
10566
+ _mm256_cvtepi32_ps(p_2), accum2);
10567
+
10568
+ y += 2;
10569
+ x += 2;
10570
+ }
10571
+
10572
+ *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
10573
+
10574
+ #else
10575
+ float sumf = 0;
10576
+ for (int ib = 0; ib < nb; ++ib) {
10577
+ const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
10578
+ int sumi1 = 0, sumi2 = 0;
10579
+ for (int j = 0; j < QK4_NL/2; ++j) {
10580
+ sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
10581
+ sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
10582
+ }
10583
+ sumf += d * (sumi1 + sumi2);
10584
+ }
10585
+ *s = sumf;
10586
+ #endif
10587
+ }
10588
+
10589
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10590
+ assert(nrc == 1);
10591
+ UNUSED(nrc);
10592
+ UNUSED(bx);
10593
+ UNUSED(by);
10594
+ UNUSED(bs);
10595
+ assert(n % QK_K == 0);
10596
+ #if QK_K == 64
10597
+ ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
10598
+ #else
10599
+
10600
+ const block_iq4_xs * restrict x = vx;
10601
+ const block_q8_K * restrict y = vy;
10602
+
10603
+ const int nb = n / QK_K;
10604
+
10605
+ #if defined __ARM_NEON
10606
+ const int8x16_t values = vld1q_s8(kvalues_iq4nl);
10607
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
10608
+ ggml_uint8x16x2_t q4bits;
10609
+ ggml_int8x16x4_t q4b;
10610
+ ggml_int8x16x4_t q8b;
10611
+ int32x4_t prod_1, prod_2;
10612
+
10613
+ float sumf = 0;
10614
+
10615
+ for (int ibl = 0; ibl < nb; ++ibl) {
10616
+
10617
+ const int8_t * q8 = y[ibl].qs;
10618
+ const uint8_t * q4 = x[ibl].qs;
10619
+ uint16_t h = x[ibl].scales_h;
10620
+
10621
+ int sumi1 = 0, sumi2 = 0;
10622
+ for (int ib = 0; ib < QK_K/64; ++ib) {
10623
+
10624
+ q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
10625
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9524
10626
 
9525
- for (int ib = 0; ib < nb; ib += 2) {
9526
- q4bits.val[0] = vld1q_u8(x[ib+0].qs);
9527
- q4bits.val[1] = vld1q_u8(x[ib+1].qs);
9528
- q8b.val[0] = vld1q_s8(y[ib+0].qs);
9529
- q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
9530
- q8b.val[2] = vld1q_s8(y[ib+1].qs);
9531
- q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);
10627
+ q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
10628
+ q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
10629
+ q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
10630
+ q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
9532
10631
 
9533
- q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
9534
- q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
9535
- q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
9536
- q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
10632
+ prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
10633
+ prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
9537
10634
 
9538
- prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
9539
- prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
10635
+ int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
10636
+ int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
10637
+ h >>= 4;
10638
+ sumi1 += vaddvq_s32(prod_1) * ls1;
10639
+ sumi2 += vaddvq_s32(prod_2) * ls2;
9540
10640
 
9541
- sumf +=
9542
- GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
9543
- GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
10641
+ }
10642
+
10643
+ sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
9544
10644
  }
9545
10645
 
9546
10646
  *s = sumf;
@@ -9549,47 +10649,73 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
9549
10649
 
9550
10650
  const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
9551
10651
  const __m128i m4b = _mm_set1_epi8(0x0f);
9552
- const __m256i mone = _mm256_set1_epi16(1);
9553
-
9554
- __m256 accum1 = _mm256_setzero_ps();
9555
- __m256 accum2 = _mm256_setzero_ps();
9556
- for (int ib = 0; ib < nb; ib += 2) {
9557
- const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
9558
- const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
9559
- const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
9560
- const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
9561
- const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
9562
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
9563
- const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
9564
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
9565
- const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
9566
- const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
9567
- const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
9568
- const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
9569
- accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
9570
- _mm256_cvtepi32_ps(p_1), accum1);
9571
- accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
9572
- _mm256_cvtepi32_ps(p_2), accum2);
9573
10652
 
9574
- y += 2;
9575
- x += 2;
10653
+ __m256 accum = _mm256_setzero_ps();
10654
+ for (int ibl = 0; ibl < nb; ++ibl) {
10655
+ const uint8_t * qs = x[ibl].qs;
10656
+ const int8_t * q8 = y[ibl].qs;
10657
+ uint16_t sh = x[ibl].scales_h;
10658
+ __m256i sumi1 = _mm256_setzero_si256();
10659
+ __m256i sumi2 = _mm256_setzero_si256();
10660
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
10661
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10662
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10663
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10664
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10665
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10666
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10667
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10668
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10669
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10670
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10671
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
10672
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
10673
+ sh >>= 4;
10674
+ const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
10675
+ const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
10676
+ sumi1 = _mm256_add_epi32(p_1, sumi1);
10677
+ sumi2 = _mm256_add_epi32(p_2, sumi2);
10678
+ }
10679
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
10680
+ _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
9576
10681
  }
9577
10682
 
9578
- *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
10683
+ *s = hsum_float_8(accum);
9579
10684
 
9580
10685
  #else
9581
10686
  float sumf = 0;
9582
- for (int ib = 0; ib < nb; ++ib) {
9583
- const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
9584
- int sumi1 = 0, sumi2 = 0;
9585
- for (int j = 0; j < QK4_NL/2; ++j) {
9586
- sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
9587
- sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
10687
+ for (int ibl = 0; ibl < nb; ++ibl) {
10688
+ const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
10689
+ uint16_t h = x[ibl].scales_h;
10690
+ const uint8_t * qs = x[ibl].qs;
10691
+ const int8_t * q8 = y[ibl].qs;
10692
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
10693
+ const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
10694
+ const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
10695
+ h >>= 4;
10696
+ const float d1 = d4d8*(ls1 - 32);
10697
+ const float d2 = d4d8*(ls2 - 32);
10698
+ int sumi1 = 0, sumi2 = 0;
10699
+ for (int j = 0; j < 16; ++j) {
10700
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
10701
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
10702
+ }
10703
+ sumf += d1 * (sumi1 + sumi2);
10704
+ qs += 16;
10705
+ q8 += 32;
10706
+ sumi1 = sumi2 = 0;
10707
+ for (int j = 0; j < 16; ++j) {
10708
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
10709
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
10710
+ }
10711
+ sumf += d2 * (sumi1 + sumi2);
10712
+ qs += 16;
10713
+ q8 += 32;
9588
10714
  }
9589
- sumf += d * (sumi1 + sumi2);
9590
10715
  }
9591
10716
  *s = sumf;
9592
10717
  #endif
10718
+ #endif
9593
10719
  }
9594
10720
 
9595
10721
  // ================================ IQ2 quantization =============================================
@@ -9600,22 +10726,25 @@ typedef struct {
9600
10726
  uint16_t * neighbours;
9601
10727
  } iq2_entry_t;
9602
10728
 
9603
- static iq2_entry_t iq2_data[3] = {
10729
+ static iq2_entry_t iq2_data[4] = {
10730
+ {NULL, NULL, NULL},
9604
10731
  {NULL, NULL, NULL},
9605
10732
  {NULL, NULL, NULL},
9606
10733
  {NULL, NULL, NULL},
9607
10734
  };
9608
10735
 
9609
10736
  static inline int iq2_data_index(enum ggml_type type) {
9610
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
10737
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
9611
10738
  return type == GGML_TYPE_IQ2_XXS ? 0 :
9612
- type == GGML_TYPE_IQ2_XS ? 1 : 2;
10739
+ type == GGML_TYPE_IQ2_XS ? 1 :
10740
+ type == GGML_TYPE_IQ1_S ? 2 : 3;
9613
10741
  }
9614
10742
 
9615
10743
  static inline int iq2_grid_size(enum ggml_type type) {
9616
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
10744
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
9617
10745
  return type == GGML_TYPE_IQ2_XXS ? 256 :
9618
- type == GGML_TYPE_IQ2_XS ? 512 : 512;
10746
+ type == GGML_TYPE_IQ2_XS ? 512 :
10747
+ type == GGML_TYPE_IQ1_S ? 512 : 1024;
9619
10748
  }
9620
10749
 
9621
10750
  static int iq2_compare_func(const void * left, const void * right) {
@@ -9716,11 +10845,79 @@ void iq2xs_init_impl(enum ggml_type type) {
9716
10845
  41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
9717
10846
  42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
9718
10847
  };
10848
+ static const uint16_t kgrid_2bit_1024[1024] = {
10849
+ 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
10850
+ 73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
10851
+ 165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
10852
+ 337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
10853
+ 517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
10854
+ 674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
10855
+ 1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
10856
+ 1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
10857
+ 1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
10858
+ 1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
10859
+ 2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
10860
+ 2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
10861
+ 2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
10862
+ 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
10863
+ 4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
10864
+ 4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
10865
+ 4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
10866
+ 4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
10867
+ 5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
10868
+ 5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
10869
+ 5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
10870
+ 5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
10871
+ 6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
10872
+ 6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
10873
+ 8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
10874
+ 8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
10875
+ 8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
10876
+ 9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
10877
+ 9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
10878
+ 10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
10879
+ 16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
10880
+ 16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
10881
+ 16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
10882
+ 16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
10883
+ 17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
10884
+ 17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
10885
+ 17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
10886
+ 17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
10887
+ 18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
10888
+ 18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
10889
+ 18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
10890
+ 20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
10891
+ 20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
10892
+ 20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
10893
+ 21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
10894
+ 21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
10895
+ 22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
10896
+ 22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
10897
+ 24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
10898
+ 24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
10899
+ 25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
10900
+ 26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
10901
+ 32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
10902
+ 33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
10903
+ 33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
10904
+ 33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
10905
+ 34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
10906
+ 35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
10907
+ 36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
10908
+ 37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
10909
+ 38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
10910
+ 39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
10911
+ 41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
10912
+ 42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
10913
+ };
9719
10914
 
9720
10915
  const int kmap_size = 43692;
9721
- const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
10916
+ //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
10917
+ const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
9722
10918
  const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
9723
- type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 : kgrid_1bit_512;
10919
+ type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
10920
+ type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
9724
10921
  uint64_t * kgrid_q2xs;
9725
10922
  int * kmap_q2xs;
9726
10923
  uint16_t * kneighbors_q2xs;
@@ -9817,7 +11014,7 @@ void iq2xs_init_impl(enum ggml_type type) {
9817
11014
  }
9818
11015
 
9819
11016
  void iq2xs_free_impl(enum ggml_type type) {
9820
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
11017
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
9821
11018
  const int gindex = iq2_data_index(type);
9822
11019
  if (iq2_data[gindex].grid) {
9823
11020
  free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
@@ -9866,7 +11063,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9866
11063
 
9867
11064
  const int kMaxQ = 3;
9868
11065
 
9869
- const int nbl = n/256;
11066
+ const int nbl = n/QK_K;
9870
11067
 
9871
11068
  block_iq2_xxs * y = vy;
9872
11069
 
@@ -10039,7 +11236,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
10039
11236
 
10040
11237
  const int kMaxQ = 3;
10041
11238
 
10042
- const int nbl = n/256;
11239
+ const int nbl = n/QK_K;
10043
11240
 
10044
11241
  block_iq2_xs * y = vy;
10045
11242
 
@@ -10239,14 +11436,15 @@ typedef struct {
10239
11436
  uint16_t * neighbours;
10240
11437
  } iq3_entry_t;
10241
11438
 
10242
- static iq3_entry_t iq3_data[1] = {
11439
+ static iq3_entry_t iq3_data[2] = {
11440
+ {NULL, NULL, NULL},
10243
11441
  {NULL, NULL, NULL},
10244
11442
  };
10245
11443
 
10246
11444
  static inline int iq3_data_index(int grid_size) {
10247
11445
  (void)grid_size;
10248
- GGML_ASSERT(grid_size == 256);
10249
- return 0;
11446
+ GGML_ASSERT(grid_size == 256 || grid_size == 512);
11447
+ return grid_size == 256 ? 0 : 1;
10250
11448
  }
10251
11449
 
10252
11450
  static int iq3_compare_func(const void * left, const void * right) {
@@ -10278,9 +11476,44 @@ void iq3xs_init_impl(int grid_size) {
10278
11476
  3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
10279
11477
  3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
10280
11478
  };
11479
+ static const uint16_t kgrid_512[512] = {
11480
+ 0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34,
11481
+ 37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77,
11482
+ 80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142,
11483
+ 145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210,
11484
+ 217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288,
11485
+ 291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393,
11486
+ 395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514,
11487
+ 516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576,
11488
+ 577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653,
11489
+ 655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727,
11490
+ 728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833,
11491
+ 840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977,
11492
+ 989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047,
11493
+ 1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103,
11494
+ 1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199,
11495
+ 1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296,
11496
+ 1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415,
11497
+ 1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561,
11498
+ 1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648,
11499
+ 1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761,
11500
+ 1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877,
11501
+ 1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068,
11502
+ 2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177,
11503
+ 2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269,
11504
+ 2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520,
11505
+ 2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634,
11506
+ 2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805,
11507
+ 2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083,
11508
+ 3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276,
11509
+ 3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591,
11510
+ 3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729,
11511
+ 3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032,
11512
+ };
11513
+
10281
11514
  const int kmap_size = 4096;
10282
- const int nwant = 2;
10283
- const uint16_t * kgrid = kgrid_256;
11515
+ const int nwant = grid_size == 256 ? 2 : 3;
11516
+ const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
10284
11517
  uint32_t * kgrid_q3xs;
10285
11518
  int * kmap_q3xs;
10286
11519
  uint16_t * kneighbors_q3xs;
@@ -10377,7 +11610,7 @@ void iq3xs_init_impl(int grid_size) {
10377
11610
  }
10378
11611
 
10379
11612
  void iq3xs_free_impl(int grid_size) {
10380
- GGML_ASSERT(grid_size == 256);
11613
+ GGML_ASSERT(grid_size == 256 || grid_size == 512);
10381
11614
  const int gindex = iq3_data_index(grid_size);
10382
11615
  if (iq3_data[gindex].grid) {
10383
11616
  free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
@@ -10410,9 +11643,10 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
10410
11643
  return grid_index;
10411
11644
  }
10412
11645
 
10413
- static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
11646
+ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
11647
+ const float * restrict quant_weights) {
10414
11648
 
10415
- const int gindex = iq3_data_index(256);
11649
+ const int gindex = iq3_data_index(grid_size);
10416
11650
 
10417
11651
  const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
10418
11652
  const int * kmap_q3xs = iq3_data[gindex].map;
@@ -10426,9 +11660,23 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10426
11660
 
10427
11661
  const int kMaxQ = 8;
10428
11662
 
10429
- const int nbl = n/256;
11663
+ const int nbl = n/QK_K;
10430
11664
 
10431
- block_iq3_xxs * y = vy;
11665
+ ggml_fp16_t * dh;
11666
+ uint8_t * qs;
11667
+ int block_size;
11668
+ if (grid_size == 256) {
11669
+ block_iq3_xxs * y = vy;
11670
+ dh = &y->d;
11671
+ qs = y->qs;
11672
+ block_size = sizeof(block_iq3_xxs);
11673
+ } else {
11674
+ block_iq3_s * y = vy;
11675
+ dh = &y->d;
11676
+ qs = y->qs;
11677
+ block_size = sizeof(block_iq3_s);
11678
+ }
11679
+ int quant_size = block_size - sizeof(ggml_fp16_t);
10432
11680
 
10433
11681
  float scales[QK_K/32];
10434
11682
  float weight[32];
@@ -10439,65 +11687,280 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10439
11687
  bool is_on_grid[8];
10440
11688
  bool is_on_grid_aux[8];
10441
11689
  uint8_t block_signs[8];
10442
- uint8_t q3[3*(QK_K/8)];
11690
+ uint8_t q3[3*(QK_K/8)+QK_K/32];
10443
11691
  uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
11692
+ uint8_t * qh = q3 + 3*(QK_K/8);
10444
11693
 
10445
11694
  for (int ibl = 0; ibl < nbl; ++ibl) {
10446
11695
 
10447
- y[ibl].d = GGML_FP32_TO_FP16(0.f);
10448
- memset(q3, 0, 3*QK_K/8);
11696
+ dh[0] = GGML_FP32_TO_FP16(0.f);
11697
+ memset(q3, 0, 3*QK_K/8+QK_K/32);
10449
11698
 
10450
11699
  float max_scale = 0;
10451
11700
 
10452
11701
  const float * xbl = x + QK_K*ibl;
10453
11702
  float sumx2 = 0;
10454
11703
  for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
10455
- float sigma2 = sumx2/QK_K;
11704
+ float sigma2 = 2*sumx2/QK_K;
10456
11705
 
10457
11706
  for (int ib = 0; ib < QK_K/32; ++ib) {
10458
11707
  const float * xb = xbl + 32*ib;
10459
11708
  if (quant_weights) {
10460
- const float * qw = quant_weights + QK_K*ibl + 32*ib;
10461
- for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11709
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
11710
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11711
+ } else {
11712
+ for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
11713
+ }
11714
+ for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
11715
+ for (int k = 0; k < 4; ++k) {
11716
+ int nflip = 0;
11717
+ uint8_t s = 0;
11718
+ for (int i = 0; i < 8; ++i) {
11719
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
11720
+ else {
11721
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
11722
+ }
11723
+ }
11724
+ if (nflip%2) {
11725
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
11726
+ for (int i = 1; i < 8; ++i) {
11727
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
11728
+ if (ax < min) {
11729
+ min = ax; imin = i;
11730
+ }
11731
+ }
11732
+ xval[8*k+imin] = -xval[8*k+imin];
11733
+ s ^= (1 << imin);
11734
+ }
11735
+ block_signs[k] = s & 127;
11736
+ }
11737
+ float max = xval[0];
11738
+ for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
11739
+ if (!max) {
11740
+ scales[ib] = 0;
11741
+ memset(L, 0, 32);
11742
+ continue;
11743
+ }
11744
+ float best = 0;
11745
+ float scale = max/(2*kMaxQ-1);
11746
+ for (int is = -15; is <= 15; ++is) {
11747
+ float id = (2*kMaxQ-1+is*0.2f)/max;
11748
+ float this_scale = 1/id;
11749
+ for (int k = 0; k < 8; ++k) {
11750
+ for (int i = 0; i < 4; ++i) {
11751
+ int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
11752
+ Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
11753
+ }
11754
+ uint16_t u = 0;
11755
+ for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
11756
+ int grid_index = kmap_q3xs[u];
11757
+ is_on_grid_aux[k] = true;
11758
+ if (grid_index < 0) {
11759
+ is_on_grid_aux[k] = false;
11760
+ const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
11761
+ grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
11762
+ }
11763
+ }
11764
+ float sumqx = 0, sumq2 = 0;
11765
+ for (int i = 0; i < 32; ++i) {
11766
+ float w = weight[i];
11767
+ float q = 2*Laux[i] + 1;
11768
+ sumqx += w*xval[i]*q;
11769
+ sumq2 += w*q*q;
11770
+ }
11771
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
11772
+ scale = sumqx/sumq2; best = scale*sumqx;
11773
+ for (int i = 0; i < 32; ++i) L[i] = Laux[i];
11774
+ for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
11775
+ }
11776
+ }
11777
+ int n_not_ongrid = 0;
11778
+ for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
11779
+ if (n_not_ongrid > 0 && scale > 0) {
11780
+ float id = 1/scale;
11781
+ for (int k = 0; k < 8; ++k) {
11782
+ if (is_on_grid[k]) continue;
11783
+ uint16_t u = 0;
11784
+ for (int i = 0; i < 4; ++i) {
11785
+ int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
11786
+ l = MAX(0, MIN(kMaxQ-1, l));
11787
+ u |= (l << 3*i);
11788
+ }
11789
+ int grid_index = kmap_q3xs[u];
11790
+ if (grid_index < 0) {
11791
+ const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
11792
+ grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
11793
+ }
11794
+ const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
11795
+ for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
11796
+ }
11797
+ float sumqx = 0, sumq2 = 0;
11798
+ for (int i = 0; i < 32; ++i) {
11799
+ float w = weight[i];
11800
+ float q = 2*L[i] + 1;
11801
+ sumqx += w*xval[i]*q;
11802
+ sumq2 += w*q*q;
11803
+ }
11804
+ if (sumq2 > 0) scale = sumqx/sumq2;
11805
+ }
11806
+ if (scale < 0) {
11807
+ // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
11808
+ // and correspondingly flip quant signs.
11809
+ scale = -scale;
11810
+ for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
11811
+ }
11812
+ for (int k = 0; k < 8; ++k) {
11813
+ uint16_t u = 0;
11814
+ for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
11815
+ int grid_index = kmap_q3xs[u];
11816
+ if (grid_index < 0) {
11817
+ printf("Oops: found point %u not on grid:", u);
11818
+ for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
11819
+ printf("\n");
11820
+ GGML_ASSERT(false);
11821
+ }
11822
+ if (grid_size == 256) {
11823
+ q3[8*ib+k] = grid_index;
11824
+ } else {
11825
+ q3[8*ib+k] = grid_index & 255;
11826
+ qh[ib] |= ((grid_index >> 8) << k);
11827
+ }
11828
+
11829
+ }
11830
+ scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
11831
+ GGML_ASSERT(scale >= 0);
11832
+ scales[ib] = scale;
11833
+ max_scale = MAX(max_scale, scale);
11834
+ }
11835
+
11836
+ if (!max_scale) {
11837
+ memset(qs, 0, quant_size);
11838
+ dh += block_size/sizeof(ggml_fp16_t);
11839
+ qs += block_size;
11840
+ continue;
11841
+ }
11842
+
11843
+ float d = max_scale/31;
11844
+ dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor
11845
+ float id = 1/d;
11846
+ for (int ib = 0; ib < QK_K/32; ++ib) {
11847
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
11848
+ l = MAX(0, MIN(15, l));
11849
+ scales_and_signs[ib] |= ((uint32_t)l << 28);
11850
+ }
11851
+ memcpy(qs, q3, quant_size);
11852
+
11853
+ dh += block_size/sizeof(ggml_fp16_t);
11854
+ qs += block_size;
11855
+
11856
+ }
11857
+ }
11858
+
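
Note on the metadata layout above: each 32-value group stores four 7-bit sign masks plus a 4-bit scale in one uint32_t of scales_and_signs. The eighth sign of every octet is not stored; the nflip bookkeeping forces an even number of negated values per octet, so it can be recovered as the parity of the other seven (hence the keven_signs_q2xs table referenced later). A minimal sketch of the packing, mirroring the shifts used above (illustrative only, not part of the diff):

    #include <assert.h>
    #include <stdint.h>

    // Pack four 7-bit sign masks and a 4-bit scale the way
    // scales_and_signs[ib] is assembled in quantize_row_iq3_xxs_impl.
    static uint32_t pack_scales_and_signs(const uint8_t signs[4], uint8_t scale4) {
        uint32_t u = (uint32_t)(signs[0] & 127)
                   | ((uint32_t)(signs[1] & 127) <<  7)
                   | ((uint32_t)(signs[2] & 127) << 14)
                   | ((uint32_t)(signs[3] & 127) << 21);
        return u | ((uint32_t)(scale4 & 15) << 28);
    }

    int main(void) {
        const uint8_t signs[4] = {0x55, 0x2a, 0x7f, 0x00};
        uint32_t u = pack_scales_and_signs(signs, 9);
        assert(((u >> 28) & 15) == 9);      // scale nibble
        assert(((u >>  7) & 127) == 0x2a);  // second sign mask
        return 0;
    }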
11859
+ size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11860
+ (void)hist;
11861
+ GGML_ASSERT(n_per_row%QK_K == 0);
11862
+ int nblock = n_per_row/QK_K;
11863
+ char * qrow = (char *)dst;
11864
+ for (int row = 0; row < nrow; ++row) {
11865
+ quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
11866
+ src += n_per_row;
11867
+ qrow += nblock*sizeof(block_iq3_xxs);
11868
+ }
11869
+ return nrow * nblock * sizeof(block_iq3_xxs);
11870
+ }
11871
+
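
The return value follows from the block layout staged in the q3 buffer above. Assuming a QK_K == 256 build and that block_iq3_xxs is an fp16 scale plus the 3*QK_K/8 payload bytes, a quick size check (illustrative, not part of the diff):

    #include <stdio.h>

    int main(void) {
        const int QK = 256;                            // assumed QK_K
        const int bytes = 2 + 3*QK/8;                  // fp16 d + 96 payload = 98
        printf("iq3_xxs: %.4f bpw\n", 8.0*bytes/QK);   // 3.0625 bits per weight
        return 0;
    }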
11872
+ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
11873
+ assert(k % QK_K == 0);
11874
+ block_iq3_xxs * restrict y = vy;
11875
+ quantize_row_iq3_xxs_reference(x, y, k);
11876
+ }
11877
+
11878
+ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
11879
+ assert(k % QK_K == 0);
11880
+ quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
11881
+ }
11882
+
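
A hypothetical call into the reference path, assuming QK_K == 256 and that ggml_quantize_init() (the initializer the grid asserts refer to) takes the quant type as its argument:

    // Illustrative sketch only: one row of 512 values = 2 iq3_xxs blocks.
    void example_iq3_xxs(void) {
        float row[512];
        block_iq3_xxs out[2];
        for (int i = 0; i < 512; ++i) row[i] = 0.01f*(i - 256);
        ggml_quantize_init(GGML_TYPE_IQ3_XXS);  // builds kgrid/kmap/kneighbours
        quantize_row_iq3_xxs_reference(row, out, 512);
    }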
11883
+ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
11884
+ const float * restrict quant_weights,
11885
+ float * scales,
11886
+ float * weight,
11887
+ float * xval,
11888
+ int8_t * L,
11889
+ int8_t * Laux,
11890
+ float * waux,
11891
+ bool * is_on_grid,
11892
+ bool * is_on_grid_aux,
11893
+ uint8_t * block_signs) {
11894
+
11895
+ const int gindex = iq3_data_index(512);
11896
+
11897
+ const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
11898
+ const int * kmap_q3xs = iq3_data[gindex].map;
11899
+ const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
11900
+
11901
+ //GGML_ASSERT(quant_weights && "missing quantization weights");
11902
+ GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
11903
+ GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
11904
+ GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
11905
+ GGML_ASSERT(n%QK_K == 0);
11906
+
11907
+ const int kMaxQ = 8;
11908
+
11909
+ const int nbl = n/QK_K;
11910
+
11911
+ block_iq3_s * y = vy;
11912
+
11913
+ const int bs4 = block_size/4;
11914
+ const int bs8 = block_size/8;
11915
+
11916
+ for (int ibl = 0; ibl < nbl; ++ibl) {
11917
+
11918
+ memset(&y[ibl], 0, sizeof(block_iq3_s));
11919
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
11920
+
11921
+ uint8_t * qs = y[ibl].qs;
11922
+ uint8_t * qh = y[ibl].qh;
11923
+ uint8_t * signs = y[ibl].signs;
11924
+
11925
+ float max_scale = 0;
11926
+
11927
+ const float * xbl = x + QK_K*ibl;
11928
+ float sumx2 = 0;
11929
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
11930
+ float sigma2 = 2*sumx2/QK_K;
11931
+
11932
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
11933
+ const float * xb = xbl + block_size*ib;
11934
+ if (quant_weights) {
11935
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
11936
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
10462
11937
  } else {
10463
- for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
11938
+ for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
10464
11939
  }
10465
- for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
10466
- for (int k = 0; k < 4; ++k) {
10467
- int nflip = 0;
11940
+ for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
11941
+ for (int k = 0; k < bs8; ++k) {
10468
11942
  uint8_t s = 0;
10469
11943
  for (int i = 0; i < 8; ++i) {
10470
11944
  if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
10471
11945
  else {
10472
- xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
10473
- }
10474
- }
10475
- if (nflip%2) {
10476
- int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
10477
- for (int i = 1; i < 8; ++i) {
10478
- float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
10479
- if (ax < min) {
10480
- min = ax; imin = i;
10481
- }
11946
+ xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
10482
11947
  }
10483
- xval[8*k+imin] = -xval[8*k+imin];
10484
- s ^= (1 << imin);
10485
11948
  }
10486
- block_signs[k] = s & 127;
11949
+ block_signs[k] = s;
10487
11950
  }
10488
11951
  float max = xval[0];
10489
- for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
11952
+ for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
10490
11953
  if (!max) {
10491
11954
  scales[ib] = 0;
10492
- memset(L, 0, 32);
10493
11955
  continue;
10494
11956
  }
10495
11957
  float best = 0;
10496
11958
  float scale = max/(2*kMaxQ-1);
10497
- for (int is = -15; is <= 15; ++is) {
11959
+ for (int k = 0; k < bs4; ++k) is_on_grid[k] = false;
11960
+ for (int is = -9; is <= 9; ++is) {
10498
11961
  float id = (2*kMaxQ-1+is*0.2f)/max;
10499
11962
  float this_scale = 1/id;
10500
- for (int k = 0; k < 8; ++k) {
11963
+ for (int k = 0; k < bs4; ++k) {
10501
11964
  for (int i = 0; i < 4; ++i) {
10502
11965
  int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
10503
11966
  Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
@@ -10513,7 +11976,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10513
11976
  }
10514
11977
  }
10515
11978
  float sumqx = 0, sumq2 = 0;
10516
- for (int i = 0; i < 32; ++i) {
11979
+ for (int i = 0; i < block_size; ++i) {
10517
11980
  float w = weight[i];
10518
11981
  float q = 2*Laux[i] + 1;
10519
11982
  sumqx += w*xval[i]*q;
@@ -10521,16 +11984,16 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10521
11984
  }
10522
11985
  if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
10523
11986
  scale = sumqx/sumq2; best = scale*sumqx;
10524
- for (int i = 0; i < 32; ++i) L[i] = Laux[i];
10525
- for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
11987
+ for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
11988
+ for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
10526
11989
  }
10527
11990
  }
10528
11991
  int n_not_ongrid = 0;
10529
- for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
11992
+ for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
10530
11993
  if (n_not_ongrid > 0 && scale > 0) {
10531
11994
  float id = 1/scale;
10532
- for (int k = 0; k < 8; ++k) {
10533
- if (is_on_grid[k]) continue;
11995
+ for (int k = 0; k < bs4; ++k) {
11996
+ //if (is_on_grid[k]) continue;
10534
11997
  uint16_t u = 0;
10535
11998
  for (int i = 0; i < 4; ++i) {
10536
11999
  int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
@@ -10546,7 +12009,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10546
12009
  for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
10547
12010
  }
10548
12011
  float sumqx = 0, sumq2 = 0;
10549
- for (int i = 0; i < 32; ++i) {
12012
+ for (int i = 0; i < block_size; ++i) {
10550
12013
  float w = weight[i];
10551
12014
  float q = 2*L[i] + 1;
10552
12015
  sumqx += w*xval[i]*q;
@@ -10558,9 +12021,9 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10558
12021
  // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
10559
12022
  // and correspondingly flip quant signs.
10560
12023
  scale = -scale;
10561
- for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
12024
+ for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
10562
12025
  }
10563
- for (int k = 0; k < 8; ++k) {
12026
+ for (int k = 0; k < bs4; ++k) {
10564
12027
  uint16_t u = 0;
10565
12028
  for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
10566
12029
  int grid_index = kmap_q3xs[u];
@@ -10570,99 +12033,71 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10570
12033
  printf("\n");
10571
12034
  GGML_ASSERT(false);
10572
12035
  }
10573
- q3[8*ib+k] = grid_index;
12036
+ qs[k] = grid_index & 255;
12037
+ qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
10574
12038
  }
10575
- scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
12039
+ qs += bs4;
12040
+ for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
12041
+ signs += bs8;
10576
12042
  GGML_ASSERT(scale >= 0);
10577
12043
  scales[ib] = scale;
10578
12044
  max_scale = MAX(max_scale, scale);
10579
12045
  }
10580
12046
 
10581
12047
  if (!max_scale) {
10582
- memset(y[ibl].qs, 0, 3*QK_K/8);
10583
12048
  continue;
10584
12049
  }
10585
12050
 
10586
12051
  float d = max_scale/31;
10587
- y[ibl].d = GGML_FP32_TO_FP16(d);
12052
+ y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f);
10588
12053
  float id = 1/d;
10589
- float sumqx = 0, sumq2 = 0;
10590
- for (int ib = 0; ib < QK_K/32; ++ib) {
10591
- int l = nearest_int(0.5f*(id*scales[ib]-1));
10592
- l = MAX(0, MIN(15, l));
10593
- scales_and_signs[ib] |= ((uint32_t)l << 28);
10594
- if (false) {
10595
- const float * xb = xbl + 32*ib;
10596
- if (quant_weights) {
10597
- const float * qw = quant_weights + QK_K*ibl + 32*ib;
10598
- for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
10599
- } else {
10600
- for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
10601
- }
10602
- const float db = 0.25f * d * (1 + 2*l);
10603
- for (int k = 0; k < 8; ++k) {
10604
- const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
10605
- const float * xk = xb + 4*k;
10606
- const float * wk = weight + 4*k;
10607
- //const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
10608
- const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
10609
- float best_mse = 0; int best_index = q3[8*ib+k];
10610
- for (int j = 0; j < 4; ++j) {
10611
- float diff = db * grid[j] * signs[j] - xk[j];
10612
- best_mse += wk[j] * diff * diff;
10613
- }
10614
- for (int idx = 0; idx < 256; ++idx) {
10615
- //grid = (const uint8_t *)(kgrid_q3xs + idx);
10616
- grid = (const uint8_t *)(iq3xxs_grid + idx);
10617
- float mse = 0;
10618
- for (int j = 0; j < 4; ++j) {
10619
- float diff = db * grid[j] * signs[j] - xk[j];
10620
- mse += wk[j] * diff * diff;
10621
- }
10622
- if (mse < best_mse) {
10623
- best_mse = mse; best_index = idx;
10624
- }
10625
- }
10626
- q3[8*ib+k] = best_index;
10627
- //grid = (const uint8_t *)(kgrid_q3xs + best_index);
10628
- grid = (const uint8_t *)(iq3xxs_grid + best_index);
10629
- for (int j = 0; j < 4; ++j) {
10630
- float q = db * grid[j] * signs[j];
10631
- sumqx += wk[j] * q * xk[j];
10632
- sumq2 += wk[j] * q * q;
10633
- }
10634
- }
10635
- if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
10636
- }
12054
+ for (int ib = 0; ib < QK_K/block_size; ib += 2) {
12055
+ int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
12056
+ l1 = MAX(0, MIN(15, l1));
12057
+ int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
12058
+ l2 = MAX(0, MIN(15, l2));
12059
+ y[ibl].scales[ib/2] = l1 | (l2 << 4);
10637
12060
  }
10638
- memcpy(y[ibl].qs, q3, 3*QK_K/8);
12061
+
10639
12062
  }
10640
12063
  }
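
Unlike iq3_xxs, iq3_s keeps a 9-bit grid index per 4-value group: the low byte goes to qs and the ninth bit is packed one-per-group into qh, as written above. A sketch of the inverse mapping (illustrative only), using the same ib/bs4/k indexing and taking qs as the block-base pointer (the writer advances it by bs4 per sub-block):

    #include <stdint.h>

    static int iq3s_grid_index(const uint8_t *qs, const uint8_t *qh,
                               int ib, int bs4, int k) {
        const int j  = ib*bs4 + k;              // global 4-value group index
        const int hi = (qh[j/8] >> (j%8)) & 1;  // ninth bit
        return qs[j] | (hi << 8);
    }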
10641
12064
 
10642
- size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12065
+ #define IQ3S_BLOCK_SIZE 32
12066
+ size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
10643
12067
  (void)hist;
10644
12068
  GGML_ASSERT(n_per_row%QK_K == 0);
10645
12069
  int nblock = n_per_row/QK_K;
12070
+ float scales[QK_K/IQ3S_BLOCK_SIZE];
12071
+ float weight[IQ3S_BLOCK_SIZE];
12072
+ float xval[IQ3S_BLOCK_SIZE];
12073
+ int8_t L[IQ3S_BLOCK_SIZE];
12074
+ int8_t Laux[IQ3S_BLOCK_SIZE];
12075
+ float waux[IQ3S_BLOCK_SIZE];
12076
+ bool is_on_grid[IQ3S_BLOCK_SIZE/4];
12077
+ bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
12078
+ uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
10646
12079
  char * qrow = (char *)dst;
10647
12080
  for (int row = 0; row < nrow; ++row) {
10648
- quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights);
12081
+ quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
12082
+ scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
10649
12083
  src += n_per_row;
10650
- qrow += nblock*sizeof(block_iq3_xxs);
12084
+ qrow += nblock*sizeof(block_iq3_s);
10651
12085
  }
10652
- return nrow * nblock * sizeof(block_iq3_xxs);
12086
+ return nrow * nblock * sizeof(block_iq3_s);
10653
12087
  }
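
quantize_iq3_s passes all scratch buffers in from the caller, presumably so the impl can stay generic over block_size without variable-length locals. With QK_K == 256, the fields written above (fp16 d, 64 qs bytes, 8 qh bytes, 32 sign bytes, 4 packed scale bytes) would put sizeof(block_iq3_s) at 110 bytes, i.e. 3.4375 bpw. The paired 4-bit scales can be recovered like this (illustrative sketch):

    #include <stdint.h>

    // Invert y[ibl].scales[ib/2] = l1 | (l2 << 4) from the impl above.
    static int iq3s_block_scale(const uint8_t *scales, int ib) {
        return ib % 2 == 0 ? (scales[ib/2] & 0xf) : (scales[ib/2] >> 4);
    }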
10654
12088
 
10655
- void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
12089
+ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
10656
12090
  assert(k % QK_K == 0);
10657
- block_iq3_xxs * restrict y = vy;
10658
- quantize_row_iq3_xxs_reference(x, y, k);
12091
+ block_iq3_s * restrict y = vy;
12092
+ quantize_row_iq3_s_reference(x, y, k);
10659
12093
  }
10660
12094
 
10661
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
12095
+ void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
10662
12096
  assert(k % QK_K == 0);
10663
- quantize_row_iq3_xxs_impl(x, y, k, NULL);
12097
+ quantize_iq3_s(x, y, 1, k, NULL, NULL);
10664
12098
  }
10665
12099
 
12100
+
10666
12101
  // =================================== 1.5 bpw ===================================================
10667
12102
 
10668
12103
  static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
@@ -10745,7 +12180,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
10745
12180
  GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
10746
12181
  GGML_ASSERT(n%QK_K == 0);
10747
12182
 
10748
- const int nbl = n/256;
12183
+ const int nbl = n/QK_K;
10749
12184
 
10750
12185
  block_iq1_s * y = vy;
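
The n/256 -> n/QK_K fix above matters for non-default block sizes: in a hypothetical QK_K == 64 build the old expression undercounts the blocks by a factor of four. Illustrative arithmetic:

    #include <assert.h>

    int main(void) {
        const int QK_K_small = 64, n = 512;  // hypothetical QK_K == 64 build
        assert(n / QK_K_small == 8);         // correct block count
        assert(n / 256 == 2);                // the old expression undercounts
        return 0;
    }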
10751
12186
 
@@ -10880,23 +12315,23 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
10880
12315
  return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
10881
12316
  }
10882
12317
 
10883
- static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x,
10884
- ggml_fp16_t * dh, uint8_t * q4,
10885
- float * weight, uint8_t * L,
12318
+ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
12319
+ ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
12320
+ float * scales, float * weight, uint8_t * L,
10886
12321
  const int8_t * values,
10887
12322
  const float * quant_weights) {
10888
12323
 
10889
12324
  const int ntry = 7;
10890
12325
 
10891
12326
  float sigma2 = 0;
10892
- for (int j = 0; j < QK4_NL; ++j) sigma2 += x[j]*x[j];
10893
- sigma2 *= 2.f/QK4_NL;
12327
+ for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
12328
+ sigma2 *= 2.f/super_block_size;
10894
12329
 
10895
- const int nb = QK4_NL/block_size;
12330
+ memset(q4, 0, super_block_size/2);
12331
+ dh[0] = GGML_FP32_TO_FP16(0.f);
10896
12332
 
10897
- memset(q4, 0, QK4_NL/2);
10898
- for (int ib = 0; ib < nb; ++ib) {
10899
- dh[ib] = GGML_FP32_TO_FP16(0.f);
12333
+ float max_scale = 0, amax_scale = 0;
12334
+ for (int ib = 0; ib < super_block_size/block_size; ++ib) {
10900
12335
  const float * xb = x + ib*block_size;
10901
12336
  if (quant_weights) {
10902
12337
  const float * qw = quant_weights + ib*block_size;
@@ -10912,6 +12347,7 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
10912
12347
  }
10913
12348
  }
10914
12349
  if (!amax) {
12350
+ scales[ib] = 0;
10915
12351
  continue;
10916
12352
  }
10917
12353
  float d = -max/values[0];
@@ -10925,7 +12361,6 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
10925
12361
  sumqx += w*q*xb[j];
10926
12362
  sumq2 += w*q*q;
10927
12363
  }
10928
- float best_id = id;
10929
12364
  d = sumqx/sumq2;
10930
12365
  float best = d*sumqx;
10931
12366
  for (int itry = -ntry; itry <= ntry; ++itry) {
@@ -10941,15 +12376,47 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
10941
12376
  }
10942
12377
  if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
10943
12378
  d = sumqx/sumq2; best = d * sumqx;
10944
- best_id = id;
10945
12379
  }
10946
12380
  }
10947
- dh[ib] = GGML_FP32_TO_FP16(d);
10948
- for (int j = 0; j < block_size; ++j) {
10949
- L[ib*block_size + j] = best_index_int8(16, values, best_id*xb[j]);
12381
+ scales[ib] = d;
12382
+ float abs_d = fabsf(d);
12383
+ if (abs_d > amax_scale) {
12384
+ amax_scale = abs_d; max_scale = d;
12385
+ }
12386
+ }
12387
+
12388
+ if (super_block_size/block_size > 1) {
12389
+ int nb = super_block_size/block_size;
12390
+ memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
12391
+ float d = -max_scale/32;
12392
+ dh[0] = GGML_FP32_TO_FP16(d);
12393
+ float id = d ? 1/d : 0.f;
12394
+ for (int ib = 0; ib < super_block_size/block_size; ++ib) {
12395
+ int l = nearest_int(id*scales[ib]);
12396
+ l = MAX(-32, MIN(31, l));
12397
+ float dl = d * l;
12398
+ float idl = dl ? 1/dl : 0.f;
12399
+ uint8_t * Lb = L + ib*block_size;
12400
+ const float * xb = x + ib*block_size;
12401
+ for (int j = 0; j < block_size; ++j) {
12402
+ Lb[j] = best_index_int8(16, values, idl*xb[j]);
12403
+ }
12404
+ l += 32;
12405
+ uint8_t l_l = l & 0xf;
12406
+ uint8_t l_h = l >> 4;
12407
+ if (ib%2 == 0) scales_l[ib/2] = l_l;
12408
+ else scales_l[ib/2] |= (l_l << 4);
12409
+ scales_h[ib/8] |= (l_h << 2*(ib%8));
12410
+ }
12411
+ } else {
12412
+ dh[0] = GGML_FP32_TO_FP16(scales[0]);
12413
+ float id = scales[0] ? 1/scales[0] : 0;
12414
+ for (int j = 0; j < super_block_size; ++j) {
12415
+ L[j] = best_index_int8(16, values, id*x[j]);
10950
12416
  }
10951
12417
  }
10952
- for (int i = 0; i < QK4_NL/32; ++i) {
12418
+
12419
+ for (int i = 0; i < super_block_size/32; ++i) {
10953
12420
  for (int j = 0; j < 16; ++j) {
10954
12421
  q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
10955
12422
  }
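
The packing loop above interleaves the two halves of each 32-value group into nibbles. The inverse, for reference (illustrative sketch):

    #include <stdint.h>

    // Undo q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4).
    static void unpack_iq4_group(const uint8_t *q4, uint8_t *L, int i) {
        for (int j = 0; j < 16; ++j) {
            L[32*i + j]      = q4[16*i + j] & 0xf;
            L[32*i + 16 + j] = q4[16*i + j] >> 4;
        }
    }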
@@ -10962,12 +12429,16 @@ size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, i
10962
12429
  int nblock = n_per_row/QK4_NL;
10963
12430
  char * qrow = (char *)dst;
10964
12431
  uint8_t L[QK4_NL];
10965
- float weight[32];
12432
+ float weight[QK4_NL];
12433
+ uint16_t unused_h;
12434
+ uint8_t * unused_l = NULL;
12435
+ float scale;
10966
12436
  for (int row = 0; row < nrow; ++row) {
10967
12437
  block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
10968
12438
  for (int ibl = 0; ibl < nblock; ++ibl) {
10969
12439
  const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
10970
- quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, weight, L, kvalues_iq4nl, qw);
12440
+ quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
12441
+ &scale, weight, L, kvalues_iq4nl, qw);
10971
12442
  }
10972
12443
  src += n_per_row;
10973
12444
  qrow += nblock*sizeof(block_iq4_nl);
@@ -10986,3 +12457,232 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
10986
12457
  quantize_iq4_nl(x, y, 1, k, NULL, NULL);
10987
12458
  }
10988
12459
 
12460
+ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12461
+ #if QK_K == 64
12462
+ return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
12463
+ #else
12464
+ (void)hist;
12465
+ GGML_ASSERT(n_per_row%QK_K == 0);
12466
+ int nblock = n_per_row/QK_K;
12467
+ char * qrow = (char *)dst;
12468
+ uint8_t L[QK_K];
12469
+ float weight[32];
12470
+ float scales[QK_K/32];
12471
+ for (int row = 0; row < nrow; ++row) {
12472
+ block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
12473
+ for (int ibl = 0; ibl < nblock; ++ibl) {
12474
+ const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
12475
+ quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
12476
+ scales, weight, L, kvalues_iq4nl, qw);
12477
+ }
12478
+ src += n_per_row;
12479
+ qrow += nblock*sizeof(block_iq4_xs);
12480
+ }
12481
+ return nrow * nblock * sizeof(block_iq4_xs);
12482
+ #endif
12483
+ }
12484
+
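
iq4_xs splits each 6-bit block scale across scales_l (low nibble) and scales_h (two bits per block), with a +32 bias applied before packing, as written in the impl above. Reassembly looks like this (illustrative sketch):

    #include <stdint.h>

    static int iq4xs_block_scale(const uint8_t *scales_l,
                                 const uint16_t *scales_h, int ib) {
        const int l_l = ib % 2 == 0 ? (scales_l[ib/2] & 0xf) : (scales_l[ib/2] >> 4);
        const int l_h = (scales_h[ib/8] >> (2*(ib%8))) & 3;
        return (l_l | (l_h << 4)) - 32;  // undo the +32 bias
    }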
12485
+ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
12486
+ assert(k % QK_K == 0);
12487
+ block_iq4_xs * restrict y = vy;
12488
+ quantize_row_iq4_xs_reference(x, y, k);
12489
+ }
12490
+
12491
+ void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
12492
+ assert(k % QK_K == 0);
12493
+ quantize_iq4_xs(x, y, 1, k, NULL, NULL);
12494
+ }
12495
+
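
Size check for the new type, assuming QK_K == 256 and the block_iq4_xs fields used above (fp16 d, uint16 scales_h, QK_K/64 scales_l bytes, QK_K/2 qs bytes); illustrative only:

    #include <stdio.h>

    int main(void) {
        const int QK = 256;
        const int bytes = 2 + 2 + QK/64 + QK/2;       // 2+2+4+128 = 136
        printf("iq4_xs: %.2f bpw\n", 8.0*bytes/QK);   // 4.25 bits per weight
        return 0;
    }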
12496
+ // =============================== 2.5625 bpw
12497
+
12498
+ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12499
+
12500
+ const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
12501
+
12502
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
12503
+ const int * kmap_q2xs = iq2_data[gindex].map;
12504
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
12505
+
12506
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
12507
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
12508
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
12509
+ GGML_ASSERT(n%QK_K == 0);
12510
+
12511
+ const int kMaxQ = 3;
12512
+
12513
+ const int nbl = n/QK_K;
12514
+
12515
+ block_iq2_s * y = vy;
12516
+
12517
+ float scales[QK_K/16];
12518
+ float weight[16];
12519
+ float xval[16];
12520
+ int8_t L[16];
12521
+ int8_t Laux[16];
12522
+ float waux[16];
12523
+ bool is_on_grid[2];
12524
+ bool is_on_grid_aux[2];
12525
+ uint8_t block_signs[2];
12526
+
12527
+ for (int ibl = 0; ibl < nbl; ++ibl) {
12528
+
12529
+ memset(&y[ibl], 0, sizeof(block_iq2_s));
12530
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
12531
+
12532
+ float max_scale = 0;
12533
+
12534
+ const float * xbl = x + QK_K*ibl;
12535
+ float sumx2 = 0;
12536
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
12537
+ float sigma2 = 2*sumx2/QK_K;
12538
+
12539
+ for (int ib = 0; ib < QK_K/16; ++ib) {
12540
+ const float * xb = xbl + 16*ib;
12541
+ if (quant_weights) {
12542
+ const float * qw = quant_weights + QK_K*ibl + 16*ib;
12543
+ for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
12544
+ } else {
12545
+ for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
12546
+ }
12547
+ for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
12548
+ for (int k = 0; k < 2; ++k) {
12549
+ uint8_t s = 0;
12550
+ for (int i = 0; i < 8; ++i) {
12551
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
12552
+ else {
12553
+ xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
12554
+ }
12555
+ }
12556
+ block_signs[k] = s;
12557
+ }
12558
+ float max = xval[0];
12559
+ for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
12560
+ if (!max) {
12561
+ scales[ib] = 0;
12562
+ continue;
12563
+ }
12564
+ float best = 0;
12565
+ float scale = max/(2*kMaxQ-1);
12566
+ is_on_grid[0] = is_on_grid[1] = true;
12567
+ for (int is = -9; is <= 9; ++is) {
12568
+ float id = (2*kMaxQ-1+is*0.1f)/max;
12569
+ float this_scale = 1/id;
12570
+ for (int k = 0; k < 2; ++k) {
12571
+ for (int i = 0; i < 8; ++i) {
12572
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
12573
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
12574
+ }
12575
+ uint16_t u = 0;
12576
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
12577
+ int grid_index = kmap_q2xs[u];
12578
+ is_on_grid_aux[k] = true;
12579
+ if (grid_index < 0) {
12580
+ is_on_grid_aux[k] = false;
12581
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12582
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
12583
+ }
12584
+ }
12585
+ float sumqx = 0, sumq2 = 0;
12586
+ for (int i = 0; i < 16; ++i) {
12587
+ float w = weight[i];
12588
+ float q = 2*Laux[i] + 1;
12589
+ sumqx += w*xval[i]*q;
12590
+ sumq2 += w*q*q;
12591
+ }
12592
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
12593
+ scale = sumqx/sumq2; best = scale*sumqx;
12594
+ for (int i = 0; i < 16; ++i) L[i] = Laux[i];
12595
+ for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
12596
+ }
12597
+ }
12598
+ int n_not_ongrid = 0;
12599
+ for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
12600
+ if (n_not_ongrid > 0 && scale > 0) {
12601
+ float id = 1/scale;
12602
+ for (int k = 0; k < 2; ++k) {
12603
+ if (is_on_grid[k]) continue;
12604
+ uint16_t u = 0;
12605
+ for (int i = 0; i < 8; ++i) {
12606
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
12607
+ l = MAX(0, MIN(kMaxQ-1, l));
12608
+ u |= (l << 2*i);
12609
+ L[8*k + i] = l;
12610
+ }
12611
+ int grid_index = kmap_q2xs[u];
12612
+ if (grid_index < 0) {
12613
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12614
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
12615
+ }
12616
+ }
12617
+ float sumqx = 0, sumq2 = 0;
12618
+ for (int i = 0; i < 16; ++i) {
12619
+ float w = weight[i];
12620
+ float q = 2*L[i] + 1;
12621
+ sumqx += w*xval[i]*q;
12622
+ sumq2 += w*q*q;
12623
+ }
12624
+ if (sumq2 > 0) scale = sumqx/sumq2;
12625
+ }
12626
+ if (scale < 0) {
12627
+ scale = -scale;
12628
+ for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
12629
+ }
12630
+ for (int k = 0; k < 2; ++k) {
12631
+ uint16_t u = 0;
12632
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
12633
+ int grid_index = kmap_q2xs[u];
12634
+ if (grid_index < 0) {
12635
+ printf("Oops: found point %u not on grid:", u);
12636
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
12637
+ printf("\n");
12638
+ GGML_ASSERT(false);
12639
+ }
12640
+ const int i8 = 2*ib + k;
12641
+ y[ibl].qs[i8] = grid_index & 255;
12642
+ y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
12643
+ y[ibl].qs[QK_K/8 + i8] = block_signs[k];
12644
+ }
12645
+ GGML_ASSERT(scale >= 0);
12646
+ scales[ib] = scale;
12647
+ max_scale = MAX(max_scale, scale);
12648
+ }
12649
+
12650
+ if (!max_scale) {
12651
+ continue;
12652
+ }
12653
+
12654
+ float d = max_scale/31;
12655
+ y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
12656
+ float id = 1/d;
12657
+ for (int ib = 0; ib < QK_K/16; ++ib) {
12658
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
12659
+ l = MAX(0, MIN(15, l));
12660
+ if (ib%2 == 0) y[ibl].scales[ib/2] = l;
12661
+ else y[ibl].scales[ib/2] |= (l << 4);
12662
+ }
12663
+ }
12664
+ }
12665
+
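
In iq2_s the first QK_K/8 bytes of qs hold the low 8 bits of each group's grid index, the next QK_K/8 bytes hold the full 8-bit sign masks (no parity trick here), and qh packs the two high index bits per group; with d and the packed scales that is 2+64+8+8 = 82 bytes per 256 weights, matching the 2.5625 bpw in the header. Inverse of the index packing (illustrative sketch, QK_K == 256 assumed):

    #include <stdint.h>

    static int iq2s_grid_index(const uint8_t *qs, const uint8_t *qh, int i8) {
        const int hi = (qh[i8/4] >> (2*(i8%4))) & 3;  // two high bits per group
        return qs[i8] | (hi << 8);                    // 10-bit grid index
    }

    static uint8_t iq2s_signs(const uint8_t *qs, int i8) {
        return qs[256/8 + i8];                        // signs follow the indices
    }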
12666
+ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12667
+ (void)hist;
12668
+ GGML_ASSERT(n_per_row%QK_K == 0);
12669
+ int nblock = n_per_row/QK_K;
12670
+ char * qrow = (char *)dst;
12671
+ for (int row = 0; row < nrow; ++row) {
12672
+ quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
12673
+ src += n_per_row;
12674
+ qrow += nblock*sizeof(block_iq2_s);
12675
+ }
12676
+ return nrow * nblock * sizeof(block_iq2_s);
12677
+ }
12678
+
12679
+ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
12680
+ assert(k % QK_K == 0);
12681
+ quantize_iq2_s(x, y, 1, k, NULL, NULL);
12682
+ }
12683
+
12684
+ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
12685
+ assert(k % QK_K == 0);
12686
+ block_iq2_s * restrict y = vy;
12687
+ quantize_row_iq2_s_reference(x, y, k);
12688
+ }
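
End-to-end, the new entry points compose like this (hypothetical sketch; assumes QK_K == 256 and that ggml_quantize_init() takes the quant type):

    // Quantize two rows of 512 values each with the new iq2_s API.
    void example_iq2_s(const float *src /* 2*512 floats */, void *dst) {
        ggml_quantize_init(GGML_TYPE_IQ2_S);
        size_t written = quantize_iq2_s(src, dst, /*nrow=*/2, /*n_per_row=*/512,
                                        /*hist=*/NULL, /*quant_weights=*/NULL);
        (void)written;  // == 2 * (512/256) * sizeof(block_iq2_s)
    }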