llama_cpp 0.12.7 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
462
462
  return res;
463
463
  }
464
464
 
465
+ // NOTE: not tested
466
+ inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
467
+ int8x16_t res;
468
+
469
+ res[ 0] = a[b[ 0]];
470
+ res[ 1] = a[b[ 1]];
471
+ res[ 2] = a[b[ 2]];
472
+ res[ 3] = a[b[ 3]];
473
+ res[ 4] = a[b[ 4]];
474
+ res[ 5] = a[b[ 5]];
475
+ res[ 6] = a[b[ 6]];
476
+ res[ 7] = a[b[ 7]];
477
+ res[ 8] = a[b[ 8]];
478
+ res[ 9] = a[b[ 9]];
479
+ res[10] = a[b[10]];
480
+ res[11] = a[b[11]];
481
+ res[12] = a[b[12]];
482
+ res[13] = a[b[13]];
483
+ res[14] = a[b[14]];
484
+ res[15] = a[b[15]];
485
+
486
+ return res;
487
+ }
488
+
465
489
  #else
466
490
 
467
491
  #define ggml_int16x8x2_t int16x8x2_t
@@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
476
500
  #define ggml_vld1q_s8_x2 vld1q_s8_x2
477
501
  #define ggml_vld1q_s8_x4 vld1q_s8_x4
478
502
  #define ggml_vqtbl1q_s8 vqtbl1q_s8
503
+ #define ggml_vqtbl1q_u8 vqtbl1q_u8
479
504
 
480
505
  #endif
481
506
 
@@ -1852,7 +1877,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1852
1877
  float mins[QK_K/16];
1853
1878
  float scales[QK_K/16];
1854
1879
  float sw[QK_K/16];
1855
- float weight[QK_K/16];
1880
+ float weight[16];
1856
1881
  uint8_t Ls[QK_K/16], Lm[QK_K/16];
1857
1882
 
1858
1883
  for (int i = 0; i < nb; i++) {
@@ -1862,13 +1887,42 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1862
1887
  float sigma2 = sumx2/QK_K;
1863
1888
  for (int j = 0; j < QK_K/16; ++j) {
1864
1889
  const float * restrict qw = quant_weights + QK_K * i + 16*j;
1865
- for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
1890
+ for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
1866
1891
  for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
1867
- scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
1892
+ scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
1868
1893
  }
1869
1894
 
1870
- float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
1871
- float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
1895
+ float dm, mm;
1896
+ #if QK_K == 64
1897
+ float max_scale = 0, max_min = 0;
1898
+ for (int j = 0; j < QK_K/16; ++j) {
1899
+ max_scale = MAX(max_scale, scales[j]);
1900
+ max_min = MAX(max_min, mins[j]);
1901
+ }
1902
+ dm = max_scale/15;
1903
+ mm = max_min/15;
1904
+ if (max_scale) {
1905
+ float id = 1/dm;
1906
+ for (int j = 0; j < QK_K/16; ++j) {
1907
+ int l = nearest_int(id*scales[j]);
1908
+ Ls[j] = MAX(0, MIN(15, l));
1909
+ }
1910
+ } else {
1911
+ memset(Ls, 0, QK_K/16);
1912
+ }
1913
+ if (max_min) {
1914
+ float id = 1/mm;
1915
+ for (int j = 0; j < QK_K/16; ++j) {
1916
+ int l = nearest_int(id*mins[j]);
1917
+ Lm[j] = MAX(0, MIN(15, l));
1918
+ }
1919
+ } else {
1920
+ memset(Lm, 0, QK_K/16);
1921
+ }
1922
+ #else
1923
+ dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
1924
+ mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
1925
+ #endif
1872
1926
  y[i].d = GGML_FP32_TO_FP16(dm);
1873
1927
  y[i].dmin = GGML_FP32_TO_FP16(mm);
1874
1928
  dm = GGML_FP16_TO_FP32(y[i].d);
@@ -3470,6 +3524,265 @@ static const uint64_t iq2xs_grid[512] = {
3470
3524
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
3471
3525
  };
3472
3526
 
3527
+ static const uint64_t iq2s_grid[1024] = {
3528
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3529
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
3530
+ 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
3531
+ 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
3532
+ 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
3533
+ 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
3534
+ 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
3535
+ 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
3536
+ 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
3537
+ 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
3538
+ 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
3539
+ 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
3540
+ 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
3541
+ 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
3542
+ 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
3543
+ 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
3544
+ 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
3545
+ 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
3546
+ 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
3547
+ 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
3548
+ 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
3549
+ 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
3550
+ 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
3551
+ 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
3552
+ 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
3553
+ 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
3554
+ 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
3555
+ 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
3556
+ 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
3557
+ 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
3558
+ 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
3559
+ 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
3560
+ 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
3561
+ 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
3562
+ 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
3563
+ 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
3564
+ 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
3565
+ 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
3566
+ 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
3567
+ 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
3568
+ 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
3569
+ 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
3570
+ 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
3571
+ 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
3572
+ 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
3573
+ 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
3574
+ 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
3575
+ 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
3576
+ 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
3577
+ 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
3578
+ 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
3579
+ 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
3580
+ 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
3581
+ 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
3582
+ 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
3583
+ 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
3584
+ 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
3585
+ 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
3586
+ 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
3587
+ 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
3588
+ 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
3589
+ 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
3590
+ 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
3591
+ 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
3592
+ 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
3593
+ 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
3594
+ 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
3595
+ 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
3596
+ 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
3597
+ 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
3598
+ 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
3599
+ 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
3600
+ 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
3601
+ 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
3602
+ 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
3603
+ 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
3604
+ 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
3605
+ 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
3606
+ 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
3607
+ 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
3608
+ 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
3609
+ 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
3610
+ 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
3611
+ 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
3612
+ 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
3613
+ 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
3614
+ 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
3615
+ 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
3616
+ 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
3617
+ 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
3618
+ 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
3619
+ 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
3620
+ 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
3621
+ 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
3622
+ 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
3623
+ 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
3624
+ 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
3625
+ 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
3626
+ 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
3627
+ 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
3628
+ 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
3629
+ 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
3630
+ 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
3631
+ 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
3632
+ 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
3633
+ 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
3634
+ 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
3635
+ 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
3636
+ 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
3637
+ 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
3638
+ 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
3639
+ 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
3640
+ 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
3641
+ 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
3642
+ 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
3643
+ 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
3644
+ 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
3645
+ 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
3646
+ 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
3647
+ 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
3648
+ 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
3649
+ 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
3650
+ 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
3651
+ 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
3652
+ 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
3653
+ 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
3654
+ 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
3655
+ 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
3656
+ 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
3657
+ 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
3658
+ 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
3659
+ 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
3660
+ 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
3661
+ 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
3662
+ 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
3663
+ 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
3664
+ 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
3665
+ 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
3666
+ 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
3667
+ 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
3668
+ 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
3669
+ 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
3670
+ 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
3671
+ 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
3672
+ 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
3673
+ 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
3674
+ 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
3675
+ 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
3676
+ 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
3677
+ 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
3678
+ 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
3679
+ 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
3680
+ 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
3681
+ 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
3682
+ 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
3683
+ 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
3684
+ 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
3685
+ 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
3686
+ 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
3687
+ 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
3688
+ 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
3689
+ 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
3690
+ 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
3691
+ 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
3692
+ 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
3693
+ 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
3694
+ 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
3695
+ 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
3696
+ 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
3697
+ 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
3698
+ 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
3699
+ 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
3700
+ 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
3701
+ 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
3702
+ 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
3703
+ 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
3704
+ 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
3705
+ 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
3706
+ 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
3707
+ 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
3708
+ 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
3709
+ 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
3710
+ 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
3711
+ 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
3712
+ 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
3713
+ 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
3714
+ 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
3715
+ 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
3716
+ 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
3717
+ 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
3718
+ 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
3719
+ 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
3720
+ 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
3721
+ 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
3722
+ 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
3723
+ 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
3724
+ 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
3725
+ 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
3726
+ 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
3727
+ 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
3728
+ 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
3729
+ 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
3730
+ 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
3731
+ 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
3732
+ 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
3733
+ 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
3734
+ 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
3735
+ 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
3736
+ 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
3737
+ 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
3738
+ 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
3739
+ 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
3740
+ 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
3741
+ 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
3742
+ 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
3743
+ 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
3744
+ 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
3745
+ 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
3746
+ 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
3747
+ 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
3748
+ 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
3749
+ 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
3750
+ 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
3751
+ 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
3752
+ 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
3753
+ 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
3754
+ 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
3755
+ 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
3756
+ 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
3757
+ 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
3758
+ 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
3759
+ 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
3760
+ 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
3761
+ 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
3762
+ 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
3763
+ 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
3764
+ 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
3765
+ 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
3766
+ 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
3767
+ 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
3768
+ 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
3769
+ 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
3770
+ 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
3771
+ 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
3772
+ 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
3773
+ 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
3774
+ 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
3775
+ 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
3776
+ 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
3777
+ 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
3778
+ 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
3779
+ 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
3780
+ 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
3781
+ 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
3782
+ 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
3783
+ 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
3784
+ };
3785
+
3473
3786
  static const uint32_t iq3xxs_grid[256] = {
3474
3787
  0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
3475
3788
  0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
@@ -3505,6 +3818,73 @@ static const uint32_t iq3xxs_grid[256] = {
3505
3818
  0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
3506
3819
  };
3507
3820
 
3821
+ static const uint32_t iq3xs_grid[512] = {
3822
+ 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
3823
+ 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
3824
+ 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
3825
+ 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
3826
+ 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
3827
+ 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
3828
+ 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
3829
+ 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
3830
+ 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
3831
+ 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
3832
+ 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
3833
+ 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
3834
+ 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
3835
+ 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
3836
+ 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
3837
+ 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
3838
+ 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
3839
+ 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
3840
+ 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
3841
+ 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
3842
+ 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
3843
+ 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
3844
+ 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
3845
+ 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
3846
+ 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
3847
+ 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
3848
+ 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
3849
+ 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
3850
+ 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
3851
+ 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
3852
+ 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
3853
+ 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
3854
+ 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
3855
+ 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
3856
+ 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
3857
+ 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
3858
+ 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
3859
+ 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
3860
+ 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
3861
+ 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
3862
+ 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
3863
+ 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
3864
+ 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
3865
+ 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
3866
+ 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
3867
+ 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
3868
+ 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
3869
+ 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
3870
+ 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
3871
+ 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
3872
+ 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
3873
+ 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
3874
+ 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
3875
+ 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
3876
+ 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
3877
+ 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
3878
+ 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
3879
+ 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
3880
+ 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
3881
+ 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
3882
+ 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
3883
+ 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
3884
+ 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
3885
+ 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
3886
+ };
3887
+
3508
3888
  #define NGRID_IQ2XXS 512
3509
3889
  static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
3510
3890
  0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
@@ -3704,6 +4084,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
3704
4084
  }
3705
4085
  }
3706
4086
 
4087
+ // ====================== 2.5625 bpw (de)-quantization
4088
+
4089
+ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
4090
+ assert(k % QK_K == 0);
4091
+ const int nb = k / QK_K;
4092
+
4093
+ float db[2];
4094
+
4095
+ for (int i = 0; i < nb; i++) {
4096
+
4097
+ const float d = GGML_FP16_TO_FP32(x[i].d);
4098
+ const uint8_t * qs = x[i].qs;
4099
+ const uint8_t * qh = x[i].qh;
4100
+ const uint8_t * signs = qs + QK_K/8;
4101
+
4102
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
4103
+ db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
4104
+ db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
4105
+ for (int l = 0; l < 4; ++l) {
4106
+ const float dl = db[l/2];
4107
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
4108
+ for (int j = 0; j < 8; ++j) {
4109
+ y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
4110
+ }
4111
+ y += 8;
4112
+ }
4113
+ qs += 4;
4114
+ signs += 4;
4115
+ }
4116
+ }
4117
+ }
4118
+
3707
4119
  // ====================== 3.0625 bpw (de)-quantization
3708
4120
 
3709
4121
  void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
@@ -3736,6 +4148,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
3736
4148
  }
3737
4149
  }
3738
4150
 
4151
+ // ====================== 3.3125 bpw (de)-quantization
4152
+
4153
+ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
4154
+ assert(k % QK_K == 0);
4155
+ const int nb = k / QK_K;
4156
+
4157
+ for (int i = 0; i < nb; i++) {
4158
+
4159
+ const float d = GGML_FP16_TO_FP32(x[i].d);
4160
+ const uint8_t * qs = x[i].qs;
4161
+ const uint8_t * qh = x[i].qh;
4162
+ const uint8_t * signs = x[i].signs;
4163
+
4164
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
4165
+ const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f;
4166
+ const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f;
4167
+ for (int l = 0; l < 4; ++l) {
4168
+ const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
4169
+ const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
4170
+ for (int j = 0; j < 4; ++j) {
4171
+ y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
4172
+ y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
4173
+ }
4174
+ y += 8;
4175
+ }
4176
+ qs += 8;
4177
+ signs += 4;
4178
+ for (int l = 0; l < 4; ++l) {
4179
+ const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
4180
+ const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
4181
+ for (int j = 0; j < 4; ++j) {
4182
+ y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
4183
+ y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
4184
+ }
4185
+ y += 8;
4186
+ }
4187
+ qh += 2;
4188
+ qs += 8;
4189
+ signs += 4;
4190
+ }
4191
+ }
4192
+ }
4193
+
3739
4194
  // ====================== 1.5625 bpw (de)-quantization
3740
4195
 
3741
4196
  void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
@@ -3799,6 +4254,33 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
3799
4254
  }
3800
4255
  }
3801
4256
 
4257
+ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
4258
+ assert(k % QK_K == 0);
4259
+ #if QK_K == 64
4260
+ dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
4261
+ #else
4262
+ const int nb = k / QK_K;
4263
+
4264
+ for (int i = 0; i < nb; i++) {
4265
+
4266
+ const uint8_t * qs = x[i].qs;
4267
+
4268
+ const float d = GGML_FP16_TO_FP32(x[i].d);
4269
+
4270
+ for (int ib = 0; ib < QK_K/32; ++ib) {
4271
+ const int ls = ((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4);
4272
+ const float dl = d * (ls - 32);
4273
+ for (int j = 0; j < 16; ++j) {
4274
+ y[j+ 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
4275
+ y[j+16] = dl * kvalues_iq4nl[qs[j] >> 4];
4276
+ }
4277
+ y += 32;
4278
+ qs += 16;
4279
+ }
4280
+ }
4281
+ #endif
4282
+ }
4283
+
3802
4284
  //===================================== Q8_K ==============================================
3803
4285
 
3804
4286
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -5857,7 +6339,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5857
6339
 
5858
6340
  float sumf = 0;
5859
6341
 
5860
- int isum[4];
6342
+ int isum[QK_K/16];
5861
6343
 
5862
6344
  for (int i = 0; i < nb; ++i) {
5863
6345
 
@@ -5873,14 +6355,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5873
6355
  const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5874
6356
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
5875
6357
 
5876
- isum[0] = isum[1] = isum[2] = isum[3] = 0;
6358
+ memset(isum, 0, (QK_K/16)*sizeof(int));
5877
6359
  for (int l = 0; l < 16; ++l) {
5878
6360
  isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
5879
6361
  isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
5880
6362
  isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
5881
6363
  isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
5882
6364
  }
5883
- for (int l = 0; l < 4; ++l) {
6365
+ for (int l = 0; l < QK_K/16; ++l) {
5884
6366
  isum[l] *= (sc[l] & 0xF);
5885
6367
  }
5886
6368
  sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
@@ -8806,6 +9288,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8806
9288
 
8807
9289
  #endif
8808
9290
 
9291
+ #if defined (__AVX2__) || defined (__ARM_NEON)
8809
9292
  static const int8_t keven_signs_q2xs[1024] = {
8810
9293
  1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
8811
9294
  1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
@@ -8840,6 +9323,7 @@ static const int8_t keven_signs_q2xs[1024] = {
8840
9323
  1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
8841
9324
  1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
8842
9325
  };
9326
+ #endif
8843
9327
 
8844
9328
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
8845
9329
  assert(n % QK_K == 0);
@@ -9037,15 +9521,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9037
9521
 
9038
9522
  #elif defined(__AVX2__)
9039
9523
 
9040
- const __m128i m4 = _mm_set1_epi8(0xf);
9041
- const __m128i m1 = _mm_set1_epi8(1);
9042
- const __m256i m511 = _mm256_set1_epi16(511);
9043
9524
  const __m256i mone = _mm256_set1_epi8(1);
9044
-
9045
- static const uint8_t k_bit_helper[32] = {
9046
- 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
9047
- 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
9048
- };
9049
9525
  static const char block_sign_shuffle_mask_1[32] = {
9050
9526
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
9051
9527
  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
@@ -9059,11 +9535,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9059
9535
  0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
9060
9536
  };
9061
9537
 
9062
- const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
9063
9538
  const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
9064
9539
  const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
9065
9540
  const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
9066
9541
 
9542
+ #if QK_K == 64
9543
+ static const uint8_t k_bit_helper[16] = {
9544
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
9545
+ };
9546
+ const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
9547
+ const __m128i m511 = _mm_set1_epi16(511);
9548
+ typedef union {
9549
+ __m128i vec_index;
9550
+ uint16_t index[8];
9551
+ } index_t;
9552
+
9553
+ index_t idx;
9554
+ __m256 accumf = _mm256_setzero_ps();
9555
+ for (int i = 0; i < nb; ++i) {
9556
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9557
+ const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
9558
+ idx.vec_index = _mm_and_si128(q2_data, m511);
9559
+
9560
+ const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
9561
+ const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
9562
+ const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
9563
+
9564
+ const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
9565
+ const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
9566
+ const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
9567
+
9568
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
9569
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
9570
+
9571
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
9572
+ iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
9573
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
9574
+ iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
9575
+
9576
+ __m256i signs;
9577
+ signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
9578
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
9579
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
9580
+
9581
+ signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
9582
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
9583
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
9584
+
9585
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
9586
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
9587
+
9588
+ const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
9589
+ const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
9590
+
9591
+ const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
9592
+
9593
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
9594
+
9595
+ }
9596
+
9597
+ *s = 0.125f * hsum_float_8(accumf);
9598
+ #else
9599
+
9600
+ static const uint8_t k_bit_helper[32] = {
9601
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
9602
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
9603
+ };
9604
+ const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
9605
+ const __m256i m511 = _mm256_set1_epi16(511);
9606
+ const __m128i m4 = _mm_set1_epi8(0xf);
9607
+ const __m128i m1 = _mm_set1_epi8(1);
9608
+
9067
9609
  uint64_t aux64;
9068
9610
 
9069
9611
  // somewhat hacky, but gives a significant boost in performance
@@ -9152,6 +9694,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9152
9694
  }
9153
9695
 
9154
9696
  *s = 0.125f * hsum_float_8(accumf);
9697
+ #endif
9155
9698
 
9156
9699
  #else
9157
9700
 
@@ -9193,7 +9736,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9193
9736
  #endif
9194
9737
  }
9195
9738
 
9196
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9739
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9197
9740
  assert(n % QK_K == 0);
9198
9741
  assert(nrc == 1);
9199
9742
  UNUSED(nrc);
@@ -9201,88 +9744,148 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9201
9744
  UNUSED(by);
9202
9745
  UNUSED(bs);
9203
9746
 
9204
- const block_iq3_xxs * restrict x = vx;
9205
- const block_q8_K * restrict y = vy;
9747
+ const block_iq2_s * restrict x = vx;
9748
+ const block_q8_K * restrict y = vy;
9206
9749
 
9207
9750
  const int nb = n / QK_K;
9208
9751
 
9209
9752
  #if defined(__ARM_NEON)
9210
9753
 
9211
- const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9754
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
9755
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
9756
+ };
9212
9757
 
9213
- uint32_t aux32[2];
9758
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
9214
9759
 
9215
- ggml_int8x16x4_t q3s;
9760
+ const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
9761
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
9762
+ const uint8x16_t m1 = vdupq_n_u8(1);
9763
+ const int32x4_t vzero = vdupq_n_s32(0);
9764
+
9765
+ uint8x16x2_t vs;
9766
+ ggml_int8x16x4_t q2s;
9216
9767
  ggml_int8x16x4_t q8b;
9217
9768
 
9218
9769
  float sumf = 0;
9219
9770
  for (int i = 0; i < nb; ++i) {
9771
+
9220
9772
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9221
- const uint8_t * restrict q3 = x[i].qs;
9222
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9223
- const int8_t * restrict q8 = y[i].qs;
9224
- float sumf1 = 0, sumf2 = 0;
9773
+
9774
+ const uint8_t * restrict qs = x[i].qs;
9775
+ const uint8_t * restrict qh = x[i].qh;
9776
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9777
+ const int8_t * restrict q8 = y[i].qs;
9778
+
9779
+ int sumi1 = 0, sumi2 = 0;
9225
9780
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9226
9781
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9227
- memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
9228
- const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
9229
- const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
9230
- const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
9231
- const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
9232
- q3 += 16;
9233
- q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
9234
- q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
9235
- q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
9236
- q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
9237
- q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
9238
- q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
9239
- q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
9240
- q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
9241
- const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
9242
- const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
9243
- sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
9244
- sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
9782
+ q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
9783
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
9784
+ q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
9785
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
9786
+ q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
9787
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
9788
+ q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
9789
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
9790
+ qs += 8;
9791
+
9792
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
9793
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9794
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9795
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
9796
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
9797
+
9798
+ q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
9799
+ q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
9800
+
9801
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
9802
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9803
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9804
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
9805
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
9806
+
9807
+ signs += 4;
9808
+
9809
+ q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
9810
+ q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
9811
+
9812
+ const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
9813
+ const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
9814
+ const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
9815
+ const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
9816
+
9817
+ sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
9818
+ sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
9819
+ sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
9820
+ sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
9245
9821
  }
9246
- sumf += d*(sumf1 + sumf2);
9822
+ sumf += d*(sumi1 + sumi2);
9247
9823
  }
9248
- *s = 0.5f * sumf;
9824
+
9825
+ *s = 0.125f * sumf;
9249
9826
 
9250
9827
  #elif defined(__AVX2__)
9251
9828
 
9252
- const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9829
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
9830
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
9831
+ };
9253
9832
 
9254
- uint32_t aux32[2];
9833
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
9834
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
9835
+ };
9836
+
9837
+ const __m128i m4 = _mm_set1_epi8(0xf);
9838
+ const __m128i m1 = _mm_set1_epi8(1);
9839
+
9840
+ const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
9841
+ const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
9842
+
9843
+ uint64_t aux64;
9255
9844
 
9256
9845
  __m256 accumf = _mm256_setzero_ps();
9257
9846
  for (int i = 0; i < nb; ++i) {
9258
9847
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9259
- const uint8_t * restrict q3 = x[i].qs;
9260
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9848
+ const uint8_t * restrict qs = x[i].qs;
9849
+ const uint8_t * restrict qh = x[i].qh;
9850
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9261
9851
  const int8_t * restrict q8 = y[i].qs;
9852
+
9853
+ memcpy(&aux64, x[i].scales, 8);
9854
+ const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
9855
+ const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
9856
+
9262
9857
  __m256i sumi1 = _mm256_setzero_si256();
9263
9858
  __m256i sumi2 = _mm256_setzero_si256();
9264
9859
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9265
9860
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
9266
9861
  const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
9267
- const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
9268
- iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
9269
- q3 += 8;
9270
- const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
9271
- iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
9272
- q3 += 8;
9273
- memcpy(aux32, gas, 8); gas += 8;
9274
- const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
9275
- signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
9276
- const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
9277
- signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
9278
- const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
9279
- const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
9280
- const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
9281
- const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
9282
- const uint16_t ls1 = aux32[0] >> 28;
9283
- const uint16_t ls2 = aux32[1] >> 28;
9284
- const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
9285
- const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
9862
+ const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
9863
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
9864
+ iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
9865
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
9866
+ const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
9867
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
9868
+ iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
9869
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
9870
+ qs += 8;
9871
+
9872
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
9873
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9874
+ const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
9875
+ const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
9876
+
9877
+ aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
9878
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9879
+ const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
9880
+ const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
9881
+
9882
+ signs += 4;
9883
+
9884
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
9885
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
9886
+
9887
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
9888
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
9286
9889
  sumi1 = _mm256_add_epi32(sumi1, p1);
9287
9890
  sumi2 = _mm256_add_epi32(sumi2, p2);
9288
9891
  }
@@ -9291,18 +9894,162 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9291
9894
 
9292
9895
  }
9293
9896
 
9294
- *s = 0.25f * hsum_float_8(accumf);
9897
+ *s = 0.125f * hsum_float_8(accumf);
9295
9898
 
9296
9899
  #else
9297
9900
 
9298
- uint32_t aux32;
9901
+ float sumf = 0;
9902
+ for (int i = 0; i < nb; i++) {
9299
9903
 
9300
- float sumf = 0.f;
9301
- for (int i = 0; i < nb; ++i) {
9302
9904
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9303
- const uint8_t * restrict q3 = x[i].qs;
9304
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9305
- const int8_t * restrict q8 = y[i].qs;
9905
+ const int8_t * q8 = y[i].qs;
9906
+ const uint8_t * qs = x[i].qs;
9907
+ const uint8_t * qh = x[i].qh;
9908
+ const uint8_t * signs = qs + QK_K/8;
9909
+
9910
+ int bsum = 0;
9911
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9912
+ int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
9913
+ int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
9914
+ int sumi1 = 0, sumi2 = 0;
9915
+ for (int l = 0; l < 2; ++l) {
9916
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
9917
+ for (int j = 0; j < 8; ++j) {
9918
+ sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
9919
+ }
9920
+ q8 += 8;
9921
+ }
9922
+ for (int l = 2; l < 4; ++l) {
9923
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
9924
+ for (int j = 0; j < 8; ++j) {
9925
+ sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
9926
+ }
9927
+ q8 += 8;
9928
+ }
9929
+ bsum += ls1 * sumi1 + ls2 * sumi2;
9930
+ qs += 4;
9931
+ signs += 4;
9932
+ }
9933
+
9934
+ sumf += d * bsum;
9935
+ }
9936
+
9937
+ *s = 0.125f * sumf;
9938
+
9939
+ #endif
9940
+
9941
+ }
9942
+
9943
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9944
+ assert(n % QK_K == 0);
9945
+ assert(nrc == 1);
9946
+ UNUSED(nrc);
9947
+ UNUSED(bx);
9948
+ UNUSED(by);
9949
+ UNUSED(bs);
9950
+
9951
+ const block_iq3_xxs * restrict x = vx;
9952
+ const block_q8_K * restrict y = vy;
9953
+
9954
+ const int nb = n / QK_K;
9955
+
9956
+ #if defined(__ARM_NEON)
9957
+
9958
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9959
+
9960
+ uint32_t aux32[2];
9961
+
9962
+ ggml_int8x16x4_t q3s;
9963
+ ggml_int8x16x4_t q8b;
9964
+
9965
+ float sumf = 0;
9966
+ for (int i = 0; i < nb; ++i) {
9967
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9968
+ const uint8_t * restrict q3 = x[i].qs;
9969
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
9970
+ const int8_t * restrict q8 = y[i].qs;
9971
+ float sumf1 = 0, sumf2 = 0;
9972
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9973
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9974
+ memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
9975
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
9976
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
9977
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
9978
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
9979
+ q3 += 16;
9980
+ q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
9981
+ q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
9982
+ q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
9983
+ q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
9984
+ q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
9985
+ q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
9986
+ q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
9987
+ q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
9988
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
9989
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
9990
+ sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
9991
+ sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
9992
+ }
9993
+ sumf += d*(sumf1 + sumf2);
9994
+ }
9995
+ *s = 0.5f * sumf;
9996
+
9997
+ #elif defined(__AVX2__)
9998
+
9999
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10000
+
10001
+ uint32_t aux32[2];
10002
+
10003
+ __m256 accumf = _mm256_setzero_ps();
10004
+ for (int i = 0; i < nb; ++i) {
10005
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10006
+ const uint8_t * restrict q3 = x[i].qs;
10007
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
10008
+ const int8_t * restrict q8 = y[i].qs;
10009
+ __m256i sumi1 = _mm256_setzero_si256();
10010
+ __m256i sumi2 = _mm256_setzero_si256();
10011
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10012
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10013
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10014
+ const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
10015
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
10016
+ q3 += 8;
10017
+ const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
10018
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
10019
+ q3 += 8;
10020
+ memcpy(aux32, gas, 8); gas += 8;
10021
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
10022
+ signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
10023
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
10024
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
10025
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
10026
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
10027
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
10028
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
10029
+ const uint16_t ls1 = aux32[0] >> 28;
10030
+ const uint16_t ls2 = aux32[1] >> 28;
10031
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
10032
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
10033
+ sumi1 = _mm256_add_epi32(sumi1, p1);
10034
+ sumi2 = _mm256_add_epi32(sumi2, p2);
10035
+ }
10036
+
10037
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
10038
+
10039
+ }
10040
+
10041
+ *s = 0.25f * hsum_float_8(accumf);
10042
+
10043
+ #else
10044
+
10045
+ uint32_t aux32;
10046
+
10047
+ float sumf = 0.f;
10048
+ for (int i = 0; i < nb; ++i) {
10049
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10050
+ const uint8_t * restrict q3 = x[i].qs;
10051
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
10052
+ const int8_t * restrict q8 = y[i].qs;
9306
10053
  int32_t bsum = 0;
9307
10054
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9308
10055
  memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
@@ -9327,6 +10074,202 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9327
10074
  #endif
9328
10075
  }
9329
10076
 
10077
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10078
+ assert(n % QK_K == 0);
10079
+ assert(nrc == 1);
10080
+ UNUSED(nrc);
10081
+ UNUSED(bx);
10082
+ UNUSED(by);
10083
+ UNUSED(bs);
10084
+
10085
+ const block_iq3_s * restrict x = vx;
10086
+ const block_q8_K * restrict y = vy;
10087
+
10088
+ const int nb = n / QK_K;
10089
+
10090
+ #if defined(__ARM_NEON)
10091
+
10092
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10093
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10094
+ };
10095
+
10096
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
10097
+
10098
+ const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
10099
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
10100
+
10101
+ uint8x16x2_t vs;
10102
+ ggml_int8x16x4_t q3s;
10103
+ ggml_int8x16x4_t q8b;
10104
+
10105
+ float sumf = 0;
10106
+ for (int i = 0; i < nb; ++i) {
10107
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10108
+ const uint8_t * restrict qs = x[i].qs;
10109
+ const uint8_t * restrict qh = x[i].qh;
10110
+ const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10111
+ const int8_t * restrict q8 = y[i].qs;
10112
+ int sumi1 = 0, sumi2 = 0;
10113
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10114
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
10115
+ const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
10116
+ iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
10117
+ const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
10118
+ iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
10119
+ const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
10120
+ iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
10121
+ const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
10122
+ iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
10123
+ qs += 16;
10124
+
10125
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
10126
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10127
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10128
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
10129
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
10130
+
10131
+ q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0]));
10132
+ q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
10133
+
10134
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
10135
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10136
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10137
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
10138
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
10139
+
10140
+ signs += 4;
10141
+
10142
+ q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0]));
10143
+ q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1]));
10144
+
10145
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
10146
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
10147
+ sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
10148
+ sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
10149
+ }
10150
+ sumf += d*(sumi1 + sumi2);
10151
+ }
10152
+ *s = 0.25f * sumf;
10153
+
10154
+ #elif defined(__AVX2__)
10155
+
10156
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10157
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10158
+ };
10159
+
10160
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
10161
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
10162
+ };
10163
+
10164
+ const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
10165
+ const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
10166
+
10167
+ __m256 accumf = _mm256_setzero_ps();
10168
+ for (int i = 0; i < nb; ++i) {
10169
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10170
+ const uint8_t * restrict qs = x[i].qs;
10171
+ const uint8_t * restrict qh = x[i].qh;
10172
+ const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10173
+ const int8_t * restrict q8 = y[i].qs;
10174
+ __m256i sumi1 = _mm256_setzero_si256();
10175
+ __m256i sumi2 = _mm256_setzero_si256();
10176
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10177
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10178
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10179
+ const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)],
10180
+ iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)],
10181
+ iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)],
10182
+ iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)],
10183
+ iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)],
10184
+ iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)],
10185
+ iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)],
10186
+ iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
10187
+ qs += 8;
10188
+ const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)],
10189
+ iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)],
10190
+ iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)],
10191
+ iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)],
10192
+ iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)],
10193
+ iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)],
10194
+ iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)],
10195
+ iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]);
10196
+ qs += 8;
10197
+
10198
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
10199
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
10200
+ const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
10201
+ const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
10202
+
10203
+ aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
10204
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
10205
+ const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
10206
+ const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
10207
+
10208
+ signs += 4;
10209
+
10210
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
10211
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
10212
+ const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
10213
+ const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
10214
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
10215
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
10216
+ sumi1 = _mm256_add_epi32(sumi1, p1);
10217
+ sumi2 = _mm256_add_epi32(sumi2, p2);
10218
+ }
10219
+
10220
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
10221
+
10222
+ }
10223
+
10224
+ *s = 0.25f * hsum_float_8(accumf);
10225
+
10226
+ #else
10227
+
10228
+ float sumf = 0.f;
10229
+ for (int i = 0; i < nb; ++i) {
10230
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10231
+ const uint8_t * restrict qs = x[i].qs;
10232
+ const uint8_t * restrict qh = x[i].qh;
10233
+ const uint8_t * restrict signs = x[i].signs;
10234
+ const int8_t * restrict q8 = y[i].qs;
10235
+ int32_t bsum = 0;
10236
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10237
+ const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
10238
+ const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
10239
+ int32_t sumi = 0;
10240
+ for (int l = 0; l < 4; ++l) {
10241
+ const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
10242
+ const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
10243
+ for (int j = 0; j < 4; ++j) {
10244
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10245
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
10246
+ }
10247
+ q8 += 8;
10248
+ }
10249
+ qs += 8;
10250
+ signs += 4;
10251
+ bsum += sumi * ls1;
10252
+ sumi = 0;
10253
+ for (int l = 0; l < 4; ++l) {
10254
+ const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
10255
+ const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
10256
+ for (int j = 0; j < 4; ++j) {
10257
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10258
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
10259
+ }
10260
+ q8 += 8;
10261
+ }
10262
+ qs += 8;
10263
+ signs += 4;
10264
+ bsum += sumi * ls2;
10265
+ }
10266
+ sumf += d * bsum;
10267
+ }
10268
+ *s = 0.25f * sumf;
10269
+ #endif
10270
+ }
10271
+
10272
+
9330
10273
  #ifdef __AVX2__
9331
10274
  static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
9332
10275
  const __m256i ax = _mm256_sign_epi8(x, x);
@@ -9348,7 +10291,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
9348
10291
 
9349
10292
  const int nb = n / QK_K;
9350
10293
 
9351
- #if defined __ARM_NEON
10294
+ // TODO: implement for QK_K = 64
10295
+ #if defined __ARM_NEON && QK_K == 256
9352
10296
 
9353
10297
  const uint8x16_t m8 = vdupq_n_u8(0x08);
9354
10298
  const uint8x16_t m7 = vdupq_n_u8(0x07);
@@ -9405,7 +10349,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
9405
10349
 
9406
10350
  *s = sumf;
9407
10351
 
9408
- #elif defined __AVX2__
10352
+ // TODO: implement for QK_K = 64
10353
+ #elif defined __AVX2__ && QK_K == 256
9409
10354
 
9410
10355
  const __m128i m8 = _mm_set1_epi8(0x08);
9411
10356
  const __m128i m7 = _mm_set1_epi8(0x07);
@@ -9420,8 +10365,12 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
9420
10365
 
9421
10366
  uint64_t aux64;
9422
10367
 
9423
- __m256i v_gindex;
9424
- const uint16_t * gindex = (const uint16_t *)&v_gindex;
10368
+ typedef union m256i_uint16 {
10369
+ __m256i reg;
10370
+ uint16_t s[16];
10371
+ } m256i_uint16_t;
10372
+
10373
+ m256i_uint16_t v_gindex;
9425
10374
 
9426
10375
  __m256 accum = _mm256_setzero_ps();
9427
10376
  for (int i = 0; i < nb; ++i) {
@@ -9436,13 +10385,13 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
9436
10385
  memcpy(&aux64, sc, 8); sc += 8;
9437
10386
  const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
9438
10387
  const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
9439
- v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
10388
+ v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
9440
10389
  const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
9441
10390
 
9442
10391
  for (int i32 = 0; i32 < 4; ++i32) {
9443
10392
  const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
9444
- const __m256i q1b = _mm256_set_epi64x(iq1s_grid[gindex[4*i32+3]], iq1s_grid[gindex[4*i32+2]],
9445
- iq1s_grid[gindex[4*i32+1]], iq1s_grid[gindex[4*i32+0]]);
10393
+ const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
10394
+ iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
9446
10395
  const __m256i dot = mul_add_epi8(q1b, q8b);
9447
10396
  const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
9448
10397
  const __m256i p = _mm256_madd_epi16(s16, dot);
@@ -9523,6 +10472,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
9523
10472
  float sumf = 0;
9524
10473
 
9525
10474
  for (int ib = 0; ib < nb; ib += 2) {
10475
+
9526
10476
  q4bits.val[0] = vld1q_u8(x[ib+0].qs);
9527
10477
  q4bits.val[1] = vld1q_u8(x[ib+1].qs);
9528
10478
  q8b.val[0] = vld1q_s8(y[ib+0].qs);
@@ -9592,6 +10542,138 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
9592
10542
  #endif
9593
10543
  }
9594
10544
 
10545
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10546
+ assert(nrc == 1);
10547
+ UNUSED(nrc);
10548
+ UNUSED(bx);
10549
+ UNUSED(by);
10550
+ UNUSED(bs);
10551
+ assert(n % QK_K == 0);
10552
+ #if QK_K == 64
10553
+ ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
10554
+ #else
10555
+
10556
+ const block_iq4_xs * restrict x = vx;
10557
+ const block_q8_K * restrict y = vy;
10558
+
10559
+ const int nb = n / QK_K;
10560
+
10561
+ #if defined __ARM_NEON
10562
+ const int8x16_t values = vld1q_s8(kvalues_iq4nl);
10563
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
10564
+ ggml_uint8x16x2_t q4bits;
10565
+ ggml_int8x16x4_t q4b;
10566
+ ggml_int8x16x4_t q8b;
10567
+ int32x4_t prod_1, prod_2;
10568
+
10569
+ float sumf = 0;
10570
+
10571
+ for (int ibl = 0; ibl < nb; ++ibl) {
10572
+
10573
+ const int8_t * q8 = y[ibl].qs;
10574
+ const uint8_t * q4 = x[ibl].qs;
10575
+ uint16_t h = x[ibl].scales_h;
10576
+
10577
+ int sumi1 = 0, sumi2 = 0;
10578
+ for (int ib = 0; ib < QK_K/64; ++ib) {
10579
+
10580
+ q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
10581
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
10582
+
10583
+ q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
10584
+ q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
10585
+ q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
10586
+ q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
10587
+
10588
+ prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
10589
+ prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
10590
+
10591
+ int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
10592
+ int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
10593
+ h >>= 4;
10594
+ sumi1 += vaddvq_s32(prod_1) * ls1;
10595
+ sumi2 += vaddvq_s32(prod_2) * ls2;
10596
+
10597
+ }
10598
+
10599
+ sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
10600
+ }
10601
+
10602
+ *s = sumf;
10603
+
10604
+ #elif defined __AVX2__
10605
+
10606
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
10607
+ const __m128i m4b = _mm_set1_epi8(0x0f);
10608
+
10609
+ __m256 accum = _mm256_setzero_ps();
10610
+ for (int ibl = 0; ibl < nb; ++ibl) {
10611
+ const uint8_t * qs = x[ibl].qs;
10612
+ const int8_t * q8 = y[ibl].qs;
10613
+ uint16_t sh = x[ibl].scales_h;
10614
+ __m256i sumi1 = _mm256_setzero_si256();
10615
+ __m256i sumi2 = _mm256_setzero_si256();
10616
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
10617
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10618
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10619
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10620
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10621
+ const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10622
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10623
+ const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10624
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10625
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10626
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10627
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
10628
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
10629
+ sh >>= 4;
10630
+ const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
10631
+ const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
10632
+ sumi1 = _mm256_add_epi32(p_1, sumi1);
10633
+ sumi2 = _mm256_add_epi32(p_2, sumi2);
10634
+ }
10635
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
10636
+ _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
10637
+ }
10638
+
10639
+ *s = hsum_float_8(accum);
10640
+
10641
+ #else
10642
+ float sumf = 0;
10643
+ for (int ibl = 0; ibl < nb; ++ibl) {
10644
+ const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
10645
+ uint16_t h = x[ibl].scales_h;
10646
+ const uint8_t * qs = x[ibl].qs;
10647
+ const int8_t * q8 = y[ibl].qs;
10648
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
10649
+ const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
10650
+ const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
10651
+ h >>= 4;
10652
+ const float d1 = d4d8*(ls1 - 32);
10653
+ const float d2 = d4d8*(ls2 - 32);
10654
+ int sumi1 = 0, sumi2 = 0;
10655
+ for (int j = 0; j < 16; ++j) {
10656
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
10657
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
10658
+ }
10659
+ sumf += d1 * (sumi1 + sumi2);
10660
+ qs += 16;
10661
+ q8 += 32;
10662
+ sumi1 = sumi2 = 0;
10663
+ for (int j = 0; j < 16; ++j) {
10664
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
10665
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
10666
+ }
10667
+ sumf += d2 * (sumi1 + sumi2);
10668
+ qs += 16;
10669
+ q8 += 32;
10670
+ }
10671
+ }
10672
+ *s = sumf;
10673
+ #endif
10674
+ #endif
10675
+ }
10676
+
9595
10677
  // ================================ IQ2 quantization =============================================
9596
10678
 
9597
10679
  typedef struct {
@@ -9600,22 +10682,25 @@ typedef struct {
9600
10682
  uint16_t * neighbours;
9601
10683
  } iq2_entry_t;
9602
10684
 
9603
- static iq2_entry_t iq2_data[3] = {
10685
+ static iq2_entry_t iq2_data[4] = {
10686
+ {NULL, NULL, NULL},
9604
10687
  {NULL, NULL, NULL},
9605
10688
  {NULL, NULL, NULL},
9606
10689
  {NULL, NULL, NULL},
9607
10690
  };
9608
10691
 
9609
10692
  static inline int iq2_data_index(enum ggml_type type) {
9610
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
10693
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
9611
10694
  return type == GGML_TYPE_IQ2_XXS ? 0 :
9612
- type == GGML_TYPE_IQ2_XS ? 1 : 2;
10695
+ type == GGML_TYPE_IQ2_XS ? 1 :
10696
+ type == GGML_TYPE_IQ1_S ? 2 : 3;
9613
10697
  }
9614
10698
 
9615
10699
  static inline int iq2_grid_size(enum ggml_type type) {
9616
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
10700
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
9617
10701
  return type == GGML_TYPE_IQ2_XXS ? 256 :
9618
- type == GGML_TYPE_IQ2_XS ? 512 : 512;
10702
+ type == GGML_TYPE_IQ2_XS ? 512 :
10703
+ type == GGML_TYPE_IQ1_S ? 512 : 1024;
9619
10704
  }
9620
10705
 
9621
10706
  static int iq2_compare_func(const void * left, const void * right) {
@@ -9716,11 +10801,79 @@ void iq2xs_init_impl(enum ggml_type type) {
9716
10801
  41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
9717
10802
  42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
9718
10803
  };
10804
+ static const uint16_t kgrid_2bit_1024[1024] = {
10805
+ 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
10806
+ 73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
10807
+ 165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
10808
+ 337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
10809
+ 517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
10810
+ 674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
10811
+ 1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
10812
+ 1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
10813
+ 1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
10814
+ 1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
10815
+ 2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
10816
+ 2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
10817
+ 2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
10818
+ 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
10819
+ 4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
10820
+ 4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
10821
+ 4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
10822
+ 4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
10823
+ 5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
10824
+ 5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
10825
+ 5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
10826
+ 5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
10827
+ 6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
10828
+ 6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
10829
+ 8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
10830
+ 8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
10831
+ 8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
10832
+ 9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
10833
+ 9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
10834
+ 10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
10835
+ 16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
10836
+ 16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
10837
+ 16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
10838
+ 16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
10839
+ 17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
10840
+ 17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
10841
+ 17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
10842
+ 17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
10843
+ 18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
10844
+ 18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
10845
+ 18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
10846
+ 20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
10847
+ 20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
10848
+ 20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
10849
+ 21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
10850
+ 21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
10851
+ 22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
10852
+ 22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
10853
+ 24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
10854
+ 24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
10855
+ 25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
10856
+ 26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
10857
+ 32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
10858
+ 33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
10859
+ 33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
10860
+ 33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
10861
+ 34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
10862
+ 35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
10863
+ 36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
10864
+ 37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
10865
+ 38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
10866
+ 39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
10867
+ 41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
10868
+ 42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
10869
+ };
9719
10870
 
9720
10871
  const int kmap_size = 43692;
9721
- const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
10872
+ //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
10873
+ const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
9722
10874
  const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
9723
- type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 : kgrid_1bit_512;
10875
+ type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
10876
+ type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
9724
10877
  uint64_t * kgrid_q2xs;
9725
10878
  int * kmap_q2xs;
9726
10879
  uint16_t * kneighbors_q2xs;
@@ -9817,7 +10970,7 @@ void iq2xs_init_impl(enum ggml_type type) {
9817
10970
  }
9818
10971
 
9819
10972
  void iq2xs_free_impl(enum ggml_type type) {
9820
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
10973
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
9821
10974
  const int gindex = iq2_data_index(type);
9822
10975
  if (iq2_data[gindex].grid) {
9823
10976
  free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
@@ -9866,7 +11019,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9866
11019
 
9867
11020
  const int kMaxQ = 3;
9868
11021
 
9869
- const int nbl = n/256;
11022
+ const int nbl = n/QK_K;
9870
11023
 
9871
11024
  block_iq2_xxs * y = vy;
9872
11025
 
@@ -10039,7 +11192,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
10039
11192
 
10040
11193
  const int kMaxQ = 3;
10041
11194
 
10042
- const int nbl = n/256;
11195
+ const int nbl = n/QK_K;
10043
11196
 
10044
11197
  block_iq2_xs * y = vy;
10045
11198
 
@@ -10239,14 +11392,15 @@ typedef struct {
10239
11392
  uint16_t * neighbours;
10240
11393
  } iq3_entry_t;
10241
11394
 
10242
- static iq3_entry_t iq3_data[1] = {
11395
+ static iq3_entry_t iq3_data[2] = {
11396
+ {NULL, NULL, NULL},
10243
11397
  {NULL, NULL, NULL},
10244
11398
  };
10245
11399
 
10246
11400
  static inline int iq3_data_index(int grid_size) {
10247
11401
  (void)grid_size;
10248
- GGML_ASSERT(grid_size == 256);
10249
- return 0;
11402
+ GGML_ASSERT(grid_size == 256 || grid_size == 512);
11403
+ return grid_size == 256 ? 0 : 1;
10250
11404
  }
10251
11405
 
10252
11406
  static int iq3_compare_func(const void * left, const void * right) {
@@ -10278,9 +11432,44 @@ void iq3xs_init_impl(int grid_size) {
10278
11432
  3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
10279
11433
  3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
10280
11434
  };
11435
+ static const uint16_t kgrid_512[512] = {
11436
+ 0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34,
11437
+ 37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77,
11438
+ 80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142,
11439
+ 145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210,
11440
+ 217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288,
11441
+ 291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393,
11442
+ 395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514,
11443
+ 516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576,
11444
+ 577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653,
11445
+ 655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727,
11446
+ 728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833,
11447
+ 840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977,
11448
+ 989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047,
11449
+ 1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103,
11450
+ 1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199,
11451
+ 1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296,
11452
+ 1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415,
11453
+ 1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561,
11454
+ 1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648,
11455
+ 1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761,
11456
+ 1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877,
11457
+ 1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068,
11458
+ 2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177,
11459
+ 2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269,
11460
+ 2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520,
11461
+ 2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634,
11462
+ 2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805,
11463
+ 2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083,
11464
+ 3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276,
11465
+ 3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591,
11466
+ 3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729,
11467
+ 3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032,
11468
+ };
11469
+
10281
11470
  const int kmap_size = 4096;
10282
- const int nwant = 2;
10283
- const uint16_t * kgrid = kgrid_256;
11471
+ const int nwant = grid_size == 256 ? 2 : 3;
11472
+ const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
10284
11473
  uint32_t * kgrid_q3xs;
10285
11474
  int * kmap_q3xs;
10286
11475
  uint16_t * kneighbors_q3xs;
@@ -10377,7 +11566,7 @@ void iq3xs_init_impl(int grid_size) {
10377
11566
  }
10378
11567
 
10379
11568
  void iq3xs_free_impl(int grid_size) {
10380
- GGML_ASSERT(grid_size == 256);
11569
+ GGML_ASSERT(grid_size == 256 || grid_size == 512);
10381
11570
  const int gindex = iq3_data_index(grid_size);
10382
11571
  if (iq3_data[gindex].grid) {
10383
11572
  free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
@@ -10410,9 +11599,10 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
10410
11599
  return grid_index;
10411
11600
  }
10412
11601
 
10413
- static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
11602
+ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
11603
+ const float * restrict quant_weights) {
10414
11604
 
10415
- const int gindex = iq3_data_index(256);
11605
+ const int gindex = iq3_data_index(grid_size);
10416
11606
 
10417
11607
  const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
10418
11608
  const int * kmap_q3xs = iq3_data[gindex].map;
@@ -10426,9 +11616,23 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10426
11616
 
10427
11617
  const int kMaxQ = 8;
10428
11618
 
10429
- const int nbl = n/256;
11619
+ const int nbl = n/QK_K;
10430
11620
 
10431
- block_iq3_xxs * y = vy;
11621
+ ggml_fp16_t * dh;
11622
+ uint8_t * qs;
11623
+ int block_size;
11624
+ if (grid_size == 256) {
11625
+ block_iq3_xxs * y = vy;
11626
+ dh = &y->d;
11627
+ qs = y->qs;
11628
+ block_size = sizeof(block_iq3_xxs);
11629
+ } else {
11630
+ block_iq3_s * y = vy;
11631
+ dh = &y->d;
11632
+ qs = y->qs;
11633
+ block_size = sizeof(block_iq3_s);
11634
+ }
11635
+ int quant_size = block_size - sizeof(ggml_fp16_t);
10432
11636
 
10433
11637
  float scales[QK_K/32];
10434
11638
  float weight[32];
@@ -10439,57 +11643,271 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10439
11643
  bool is_on_grid[8];
10440
11644
  bool is_on_grid_aux[8];
10441
11645
  uint8_t block_signs[8];
10442
- uint8_t q3[3*(QK_K/8)];
11646
+ uint8_t q3[3*(QK_K/8)+QK_K/32];
10443
11647
  uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
11648
+ uint8_t * qh = q3 + 3*(QK_K/8);
10444
11649
 
10445
11650
  for (int ibl = 0; ibl < nbl; ++ibl) {
10446
11651
 
11652
+ dh[0] = GGML_FP32_TO_FP16(0.f);
11653
+ memset(q3, 0, 3*QK_K/8+QK_K/32);
11654
+
11655
+ float max_scale = 0;
11656
+
11657
+ const float * xbl = x + QK_K*ibl;
11658
+ float sumx2 = 0;
11659
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
11660
+ float sigma2 = 2*sumx2/QK_K;
11661
+
11662
+ for (int ib = 0; ib < QK_K/32; ++ib) {
11663
+ const float * xb = xbl + 32*ib;
11664
+ if (quant_weights) {
11665
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
11666
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11667
+ } else {
11668
+ for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
11669
+ }
11670
+ for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
11671
+ for (int k = 0; k < 4; ++k) {
11672
+ int nflip = 0;
11673
+ uint8_t s = 0;
11674
+ for (int i = 0; i < 8; ++i) {
11675
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
11676
+ else {
11677
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
11678
+ }
11679
+ }
11680
+ if (nflip%2) {
11681
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
11682
+ for (int i = 1; i < 8; ++i) {
11683
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
11684
+ if (ax < min) {
11685
+ min = ax; imin = i;
11686
+ }
11687
+ }
11688
+ xval[8*k+imin] = -xval[8*k+imin];
11689
+ s ^= (1 << imin);
11690
+ }
11691
+ block_signs[k] = s & 127;
11692
+ }
11693
+ float max = xval[0];
11694
+ for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
11695
+ if (!max) {
11696
+ scales[ib] = 0;
11697
+ memset(L, 0, 32);
11698
+ continue;
11699
+ }
11700
+ float best = 0;
11701
+ float scale = max/(2*kMaxQ-1);
11702
+ for (int is = -15; is <= 15; ++is) {
11703
+ float id = (2*kMaxQ-1+is*0.2f)/max;
11704
+ float this_scale = 1/id;
11705
+ for (int k = 0; k < 8; ++k) {
11706
+ for (int i = 0; i < 4; ++i) {
11707
+ int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
11708
+ Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
11709
+ }
11710
+ uint16_t u = 0;
11711
+ for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
11712
+ int grid_index = kmap_q3xs[u];
11713
+ is_on_grid_aux[k] = true;
11714
+ if (grid_index < 0) {
11715
+ is_on_grid_aux[k] = false;
11716
+ const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
11717
+ grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
11718
+ }
11719
+ }
11720
+ float sumqx = 0, sumq2 = 0;
11721
+ for (int i = 0; i < 32; ++i) {
11722
+ float w = weight[i];
11723
+ float q = 2*Laux[i] + 1;
11724
+ sumqx += w*xval[i]*q;
11725
+ sumq2 += w*q*q;
11726
+ }
11727
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
11728
+ scale = sumqx/sumq2; best = scale*sumqx;
11729
+ for (int i = 0; i < 32; ++i) L[i] = Laux[i];
11730
+ for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
11731
+ }
11732
+ }
11733
+ int n_not_ongrid = 0;
11734
+ for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
11735
+ if (n_not_ongrid > 0 && scale > 0) {
11736
+ float id = 1/scale;
11737
+ for (int k = 0; k < 8; ++k) {
11738
+ if (is_on_grid[k]) continue;
11739
+ uint16_t u = 0;
11740
+ for (int i = 0; i < 4; ++i) {
11741
+ int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
11742
+ l = MAX(0, MIN(kMaxQ-1, l));
11743
+ u |= (l << 3*i);
11744
+ }
11745
+ int grid_index = kmap_q3xs[u];
11746
+ if (grid_index < 0) {
11747
+ const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
11748
+ grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
11749
+ }
11750
+ const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
11751
+ for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
11752
+ }
11753
+ float sumqx = 0, sumq2 = 0;
11754
+ for (int i = 0; i < 32; ++i) {
11755
+ float w = weight[i];
11756
+ float q = 2*L[i] + 1;
11757
+ sumqx += w*xval[i]*q;
11758
+ sumq2 += w*q*q;
11759
+ }
11760
+ if (sumq2 > 0) scale = sumqx/sumq2;
11761
+ }
11762
+ if (scale < 0) {
11763
+ // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
11764
+ // and correspondingly flip quant signs.
11765
+ scale = -scale;
11766
+ for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
11767
+ }
11768
+ for (int k = 0; k < 8; ++k) {
11769
+ uint16_t u = 0;
11770
+ for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
11771
+ int grid_index = kmap_q3xs[u];
11772
+ if (grid_index < 0) {
11773
+ printf("Oops: found point %u not on grid:", u);
11774
+ for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
11775
+ printf("\n");
11776
+ GGML_ASSERT(false);
11777
+ }
11778
+ if (grid_size == 256) {
11779
+ q3[8*ib+k] = grid_index;
11780
+ } else {
11781
+ q3[8*ib+k] = grid_index & 255;
11782
+ qh[ib] |= ((grid_index >> 8) << k);
11783
+ }
11784
+
11785
+ }
11786
+ scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
11787
+ GGML_ASSERT(scale >= 0);
11788
+ scales[ib] = scale;
11789
+ max_scale = MAX(max_scale, scale);
11790
+ }
11791
+
11792
+ if (!max_scale) {
11793
+ memset(qs, 0, quant_size);
11794
+ dh += block_size/sizeof(ggml_fp16_t);
11795
+ qs += block_size;
11796
+ continue;
11797
+ }
11798
+
11799
+ float d = max_scale/31;
11800
+ dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor
11801
+ float id = 1/d;
11802
+ for (int ib = 0; ib < QK_K/32; ++ib) {
11803
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
11804
+ l = MAX(0, MIN(15, l));
11805
+ scales_and_signs[ib] |= ((uint32_t)l << 28);
11806
+ }
11807
+ memcpy(qs, q3, quant_size);
11808
+
11809
+ dh += block_size/sizeof(ggml_fp16_t);
11810
+ qs += block_size;
11811
+
11812
+ }
11813
+ }
11814
+
11815
+ size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11816
+ (void)hist;
11817
+ GGML_ASSERT(n_per_row%QK_K == 0);
11818
+ int nblock = n_per_row/QK_K;
11819
+ char * qrow = (char *)dst;
11820
+ for (int row = 0; row < nrow; ++row) {
11821
+ quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
11822
+ src += n_per_row;
11823
+ qrow += nblock*sizeof(block_iq3_xxs);
11824
+ }
11825
+ return nrow * nblock * sizeof(block_iq3_xxs);
11826
+ }
11827
+
11828
+ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
11829
+ assert(k % QK_K == 0);
11830
+ block_iq3_xxs * restrict y = vy;
11831
+ quantize_row_iq3_xxs_reference(x, y, k);
11832
+ }
11833
+
11834
+ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
11835
+ assert(k % QK_K == 0);
11836
+ quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
11837
+ }
11838
+
11839
+ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
11840
+ const float * restrict quant_weights,
11841
+ float * scales,
11842
+ float * weight,
11843
+ float * xval,
11844
+ int8_t * L,
11845
+ int8_t * Laux,
11846
+ float * waux,
11847
+ bool * is_on_grid,
11848
+ bool * is_on_grid_aux,
11849
+ uint8_t * block_signs) {
11850
+
11851
+ const int gindex = iq3_data_index(512);
11852
+
11853
+ const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
11854
+ const int * kmap_q3xs = iq3_data[gindex].map;
11855
+ const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
11856
+
11857
+ //GGML_ASSERT(quant_weights && "missing quantization weights");
11858
+ GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
11859
+ GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
11860
+ GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
11861
+ GGML_ASSERT(n%QK_K == 0);
11862
+
11863
+ const int kMaxQ = 8;
11864
+
11865
+ const int nbl = n/QK_K;
11866
+
11867
+ block_iq3_s * y = vy;
11868
+
11869
+ const int bs4 = block_size/4;
11870
+ const int bs8 = block_size/8;
11871
+
11872
+ for (int ibl = 0; ibl < nbl; ++ibl) {
11873
+
11874
+ memset(&y[ibl], 0, sizeof(block_iq3_s));
10447
11875
  y[ibl].d = GGML_FP32_TO_FP16(0.f);
10448
- memset(q3, 0, 3*QK_K/8);
11876
+
11877
+ uint8_t * qs = y[ibl].qs;
11878
+ uint8_t * qh = y[ibl].qh;
11879
+ uint8_t * signs = y[ibl].signs;
10449
11880
 
10450
11881
  float max_scale = 0;
10451
11882
 
10452
11883
  const float * xbl = x + QK_K*ibl;
10453
11884
  float sumx2 = 0;
10454
11885
  for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
10455
- float sigma2 = sumx2/QK_K;
11886
+ float sigma2 = 2*sumx2/QK_K;
10456
11887
 
10457
- for (int ib = 0; ib < QK_K/32; ++ib) {
10458
- const float * xb = xbl + 32*ib;
11888
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
11889
+ const float * xb = xbl + block_size*ib;
10459
11890
  if (quant_weights) {
10460
- const float * qw = quant_weights + QK_K*ibl + 32*ib;
10461
- for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11891
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
11892
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
10462
11893
  } else {
10463
- for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
11894
+ for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
10464
11895
  }
10465
- for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
10466
- for (int k = 0; k < 4; ++k) {
10467
- int nflip = 0;
11896
+ for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
11897
+ for (int k = 0; k < bs8; ++k) {
10468
11898
  uint8_t s = 0;
10469
11899
  for (int i = 0; i < 8; ++i) {
10470
11900
  if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
10471
11901
  else {
10472
- xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
10473
- }
10474
- }
10475
- if (nflip%2) {
10476
- int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
10477
- for (int i = 1; i < 8; ++i) {
10478
- float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
10479
- if (ax < min) {
10480
- min = ax; imin = i;
10481
- }
11902
+ xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
10482
11903
  }
10483
- xval[8*k+imin] = -xval[8*k+imin];
10484
- s ^= (1 << imin);
10485
11904
  }
10486
- block_signs[k] = s & 127;
11905
+ block_signs[k] = s;
10487
11906
  }
10488
11907
  float max = xval[0];
10489
- for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
11908
+ for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
10490
11909
  if (!max) {
10491
11910
  scales[ib] = 0;
10492
- memset(L, 0, 32);
10493
11911
  continue;
10494
11912
  }
10495
11913
  float best = 0;
@@ -10497,7 +11915,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10497
11915
  for (int is = -15; is <= 15; ++is) {
10498
11916
  float id = (2*kMaxQ-1+is*0.2f)/max;
10499
11917
  float this_scale = 1/id;
10500
- for (int k = 0; k < 8; ++k) {
11918
+ for (int k = 0; k < bs4; ++k) {
10501
11919
  for (int i = 0; i < 4; ++i) {
10502
11920
  int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
10503
11921
  Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
@@ -10513,7 +11931,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10513
11931
  }
10514
11932
  }
10515
11933
  float sumqx = 0, sumq2 = 0;
10516
- for (int i = 0; i < 32; ++i) {
11934
+ for (int i = 0; i < block_size; ++i) {
10517
11935
  float w = weight[i];
10518
11936
  float q = 2*Laux[i] + 1;
10519
11937
  sumqx += w*xval[i]*q;
@@ -10521,15 +11939,15 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10521
11939
  }
10522
11940
  if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
10523
11941
  scale = sumqx/sumq2; best = scale*sumqx;
10524
- for (int i = 0; i < 32; ++i) L[i] = Laux[i];
10525
- for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
11942
+ for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
11943
+ for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
10526
11944
  }
10527
11945
  }
10528
11946
  int n_not_ongrid = 0;
10529
- for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
11947
+ for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
10530
11948
  if (n_not_ongrid > 0 && scale > 0) {
10531
11949
  float id = 1/scale;
10532
- for (int k = 0; k < 8; ++k) {
11950
+ for (int k = 0; k < bs4; ++k) {
10533
11951
  if (is_on_grid[k]) continue;
10534
11952
  uint16_t u = 0;
10535
11953
  for (int i = 0; i < 4; ++i) {
@@ -10546,7 +11964,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10546
11964
  for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
10547
11965
  }
10548
11966
  float sumqx = 0, sumq2 = 0;
10549
- for (int i = 0; i < 32; ++i) {
11967
+ for (int i = 0; i < block_size; ++i) {
10550
11968
  float w = weight[i];
10551
11969
  float q = 2*L[i] + 1;
10552
11970
  sumqx += w*xval[i]*q;
@@ -10558,9 +11976,9 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10558
11976
  // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
10559
11977
  // and correspondingly flip quant signs.
10560
11978
  scale = -scale;
10561
- for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
11979
+ for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
10562
11980
  }
10563
- for (int k = 0; k < 8; ++k) {
11981
+ for (int k = 0; k < bs4; ++k) {
10564
11982
  uint16_t u = 0;
10565
11983
  for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
10566
11984
  int grid_index = kmap_q3xs[u];
@@ -10570,99 +11988,71 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10570
11988
  printf("\n");
10571
11989
  GGML_ASSERT(false);
10572
11990
  }
10573
- q3[8*ib+k] = grid_index;
11991
+ qs[k] = grid_index & 255;
11992
+ qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
10574
11993
  }
10575
- scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
11994
+ qs += bs4;
11995
+ for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
11996
+ signs += bs8;
10576
11997
  GGML_ASSERT(scale >= 0);
10577
11998
  scales[ib] = scale;
10578
11999
  max_scale = MAX(max_scale, scale);
10579
12000
  }
10580
12001
 
10581
12002
  if (!max_scale) {
10582
- memset(y[ibl].qs, 0, 3*QK_K/8);
10583
12003
  continue;
10584
12004
  }
10585
12005
 
10586
12006
  float d = max_scale/31;
10587
12007
  y[ibl].d = GGML_FP32_TO_FP16(d);
10588
12008
  float id = 1/d;
10589
- float sumqx = 0, sumq2 = 0;
10590
- for (int ib = 0; ib < QK_K/32; ++ib) {
10591
- int l = nearest_int(0.5f*(id*scales[ib]-1));
10592
- l = MAX(0, MIN(15, l));
10593
- scales_and_signs[ib] |= ((uint32_t)l << 28);
10594
- if (false) {
10595
- const float * xb = xbl + 32*ib;
10596
- if (quant_weights) {
10597
- const float * qw = quant_weights + QK_K*ibl + 32*ib;
10598
- for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
10599
- } else {
10600
- for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
10601
- }
10602
- const float db = 0.25f * d * (1 + 2*l);
10603
- for (int k = 0; k < 8; ++k) {
10604
- const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
10605
- const float * xk = xb + 4*k;
10606
- const float * wk = weight + 4*k;
10607
- //const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
10608
- const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
10609
- float best_mse = 0; int best_index = q3[8*ib+k];
10610
- for (int j = 0; j < 4; ++j) {
10611
- float diff = db * grid[j] * signs[j] - xk[j];
10612
- best_mse += wk[j] * diff * diff;
10613
- }
10614
- for (int idx = 0; idx < 256; ++idx) {
10615
- //grid = (const uint8_t *)(kgrid_q3xs + idx);
10616
- grid = (const uint8_t *)(iq3xxs_grid + idx);
10617
- float mse = 0;
10618
- for (int j = 0; j < 4; ++j) {
10619
- float diff = db * grid[j] * signs[j] - xk[j];
10620
- mse += wk[j] * diff * diff;
10621
- }
10622
- if (mse < best_mse) {
10623
- best_mse = mse; best_index = idx;
10624
- }
10625
- }
10626
- q3[8*ib+k] = best_index;
10627
- //grid = (const uint8_t *)(kgrid_q3xs + best_index);
10628
- grid = (const uint8_t *)(iq3xxs_grid + best_index);
10629
- for (int j = 0; j < 4; ++j) {
10630
- float q = db * grid[j] * signs[j];
10631
- sumqx += wk[j] * q * xk[j];
10632
- sumq2 += wk[j] * q * q;
10633
- }
10634
- }
10635
- if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
10636
- }
12009
+ for (int ib = 0; ib < QK_K/block_size; ib += 2) {
12010
+ int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
12011
+ l1 = MAX(0, MIN(15, l1));
12012
+ int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
12013
+ l2 = MAX(0, MIN(15, l2));
12014
+ y[ibl].scales[ib/2] = l1 | (l2 << 4);
10637
12015
  }
10638
- memcpy(y[ibl].qs, q3, 3*QK_K/8);
12016
+
10639
12017
  }
10640
12018
  }
10641
12019
 
10642
- size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12020
+ #define IQ3S_BLOCK_SIZE 32
12021
+ size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
10643
12022
  (void)hist;
10644
12023
  GGML_ASSERT(n_per_row%QK_K == 0);
10645
12024
  int nblock = n_per_row/QK_K;
12025
+ float scales[QK_K/IQ3S_BLOCK_SIZE];
12026
+ float weight[IQ3S_BLOCK_SIZE];
12027
+ float xval[IQ3S_BLOCK_SIZE];
12028
+ int8_t L[IQ3S_BLOCK_SIZE];
12029
+ int8_t Laux[IQ3S_BLOCK_SIZE];
12030
+ float waux[IQ3S_BLOCK_SIZE];
12031
+ bool is_on_grid[IQ3S_BLOCK_SIZE/4];
12032
+ bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
12033
+ uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
10646
12034
  char * qrow = (char *)dst;
10647
12035
  for (int row = 0; row < nrow; ++row) {
10648
- quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights);
12036
+ quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
12037
+ scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
10649
12038
  src += n_per_row;
10650
- qrow += nblock*sizeof(block_iq3_xxs);
12039
+ qrow += nblock*sizeof(block_iq3_s);
10651
12040
  }
10652
- return nrow * nblock * sizeof(block_iq3_xxs);
12041
+ return nrow * nblock * sizeof(block_iq3_s);
10653
12042
  }
10654
12043
 
10655
- void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
12044
+ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
10656
12045
  assert(k % QK_K == 0);
10657
- block_iq3_xxs * restrict y = vy;
10658
- quantize_row_iq3_xxs_reference(x, y, k);
12046
+ block_iq3_s * restrict y = vy;
12047
+ quantize_row_iq3_s_reference(x, y, k);
10659
12048
  }
10660
12049
 
10661
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
12050
+ void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
10662
12051
  assert(k % QK_K == 0);
10663
- quantize_row_iq3_xxs_impl(x, y, k, NULL);
12052
+ quantize_iq3_s(x, y, 1, k, NULL, NULL);
10664
12053
  }
10665
12054
 
12055
+
10666
12056
  // =================================== 1.5 bpw ===================================================
10667
12057
 
10668
12058
  static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
@@ -10745,7 +12135,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
10745
12135
  GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
10746
12136
  GGML_ASSERT(n%QK_K == 0);
10747
12137
 
10748
- const int nbl = n/256;
12138
+ const int nbl = n/QK_K;
10749
12139
 
10750
12140
  block_iq1_s * y = vy;
10751
12141
 
@@ -10880,23 +12270,23 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
10880
12270
  return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
10881
12271
  }
10882
12272
 
10883
- static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x,
10884
- ggml_fp16_t * dh, uint8_t * q4,
10885
- float * weight, uint8_t * L,
12273
+ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
12274
+ ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
12275
+ float * scales, float * weight, uint8_t * L,
10886
12276
  const int8_t * values,
10887
12277
  const float * quant_weights) {
10888
12278
 
10889
12279
  const int ntry = 7;
10890
12280
 
10891
12281
  float sigma2 = 0;
10892
- for (int j = 0; j < QK4_NL; ++j) sigma2 += x[j]*x[j];
10893
- sigma2 *= 2.f/QK4_NL;
12282
+ for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
12283
+ sigma2 *= 2.f/super_block_size;
10894
12284
 
10895
- const int nb = QK4_NL/block_size;
12285
+ memset(q4, 0, super_block_size/2);
12286
+ dh[0] = GGML_FP32_TO_FP16(0.f);
10896
12287
 
10897
- memset(q4, 0, QK4_NL/2);
10898
- for (int ib = 0; ib < nb; ++ib) {
10899
- dh[ib] = GGML_FP32_TO_FP16(0.f);
12288
+ float max_scale = 0, amax_scale = 0;
12289
+ for (int ib = 0; ib < super_block_size/block_size; ++ib) {
10900
12290
  const float * xb = x + ib*block_size;
10901
12291
  if (quant_weights) {
10902
12292
  const float * qw = quant_weights + ib*block_size;
@@ -10912,6 +12302,7 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
10912
12302
  }
10913
12303
  }
10914
12304
  if (!amax) {
12305
+ scales[ib] = 0;
10915
12306
  continue;
10916
12307
  }
10917
12308
  float d = -max/values[0];
@@ -10925,7 +12316,6 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
10925
12316
  sumqx += w*q*xb[j];
10926
12317
  sumq2 += w*q*q;
10927
12318
  }
10928
- float best_id = id;
10929
12319
  d = sumqx/sumq2;
10930
12320
  float best = d*sumqx;
10931
12321
  for (int itry = -ntry; itry <= ntry; ++itry) {
@@ -10941,15 +12331,47 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
10941
12331
  }
10942
12332
  if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
10943
12333
  d = sumqx/sumq2; best = d * sumqx;
10944
- best_id = id;
10945
12334
  }
10946
12335
  }
10947
- dh[ib] = GGML_FP32_TO_FP16(d);
10948
- for (int j = 0; j < block_size; ++j) {
10949
- L[ib*block_size + j] = best_index_int8(16, values, best_id*xb[j]);
12336
+ scales[ib] = d;
12337
+ float abs_d = fabsf(d);
12338
+ if (abs_d > amax_scale) {
12339
+ amax_scale = abs_d; max_scale = d;
12340
+ }
12341
+ }
12342
+
12343
+ if (super_block_size/block_size > 1) {
12344
+ int nb = super_block_size/block_size;
12345
+ memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
12346
+ float d = -max_scale/32;
12347
+ dh[0] = GGML_FP32_TO_FP16(d);
12348
+ float id = d ? 1/d : 0.f;
12349
+ for (int ib = 0; ib < super_block_size/block_size; ++ib) {
12350
+ int l = nearest_int(id*scales[ib]);
12351
+ l = MAX(-32, MIN(31, l));
12352
+ float dl = d * l;
12353
+ float idl = dl ? 1/dl : 0.f;
12354
+ uint8_t * Lb = L + ib*block_size;
12355
+ const float * xb = x + ib*block_size;
12356
+ for (int j = 0; j < block_size; ++j) {
12357
+ Lb[j] = best_index_int8(16, values, idl*xb[j]);
12358
+ }
12359
+ l += 32;
12360
+ uint8_t l_l = l & 0xf;
12361
+ uint8_t l_h = l >> 4;
12362
+ if (ib%2 == 0) scales_l[ib/2] = l_l;
12363
+ else scales_l[ib/2] |= (l_l << 4);
12364
+ scales_h[ib/8] |= (l_h << 2*(ib%8));
12365
+ }
12366
+ } else {
12367
+ dh[0] = GGML_FP32_TO_FP16(scales[0]);
12368
+ float id = scales[0] ? 1/scales[0] : 0;
12369
+ for (int j = 0; j < super_block_size; ++j) {
12370
+ L[j] = best_index_int8(16, values, id*x[j]);
10950
12371
  }
10951
12372
  }
10952
- for (int i = 0; i < QK4_NL/32; ++i) {
12373
+
12374
+ for (int i = 0; i < super_block_size/32; ++i) {
10953
12375
  for (int j = 0; j < 16; ++j) {
10954
12376
  q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
10955
12377
  }
@@ -10962,12 +12384,16 @@ size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, i
10962
12384
  int nblock = n_per_row/QK4_NL;
10963
12385
  char * qrow = (char *)dst;
10964
12386
  uint8_t L[QK4_NL];
10965
- float weight[32];
12387
+ float weight[QK4_NL];
12388
+ uint16_t unused_h;
12389
+ uint8_t * unused_l = NULL;
12390
+ float scale;
10966
12391
  for (int row = 0; row < nrow; ++row) {
10967
12392
  block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
10968
12393
  for (int ibl = 0; ibl < nblock; ++ibl) {
10969
12394
  const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
10970
- quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, weight, L, kvalues_iq4nl, qw);
12395
+ quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
12396
+ &scale, weight, L, kvalues_iq4nl, qw);
10971
12397
  }
10972
12398
  src += n_per_row;
10973
12399
  qrow += nblock*sizeof(block_iq4_nl);
@@ -10986,3 +12412,232 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
10986
12412
  quantize_iq4_nl(x, y, 1, k, NULL, NULL);
10987
12413
  }
10988
12414
 
12415
+ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { // Quantizes nrow rows of n_per_row floats from src into block_iq4_xs records at dst and returns the byte count produced.
12416
+ #if QK_K == 64
12417
+ return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights); // when QK_K is 64 the whole job is delegated to the IQ4_NL quantizer
12418
+ #else
12419
+ (void)hist; // hist is not used on this path
12420
+ GGML_ASSERT(n_per_row%QK_K == 0); // a row must consist of whole QK_K-sized super-blocks
12421
+ int nblock = n_per_row/QK_K; // super-blocks per row
12422
+ char * qrow = (char *)dst; // byte cursor to the start of the current output row
12423
+ uint8_t L[QK_K]; // scratch buffers handed to quantize_row_iq4_nl_impl
12424
+ float weight[32];
12425
+ float scales[QK_K/32];
12426
+ for (int row = 0; row < nrow; ++row) {
12427
+ block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
12428
+ for (int ibl = 0; ibl < nblock; ++ibl) {
12429
+ const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL; // quant_weights is optional; it is indexed by position within the row only, so every row uses the same weights
12430
+ quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l, // super-block size QK_K, sub-block size 32; outputs go straight into the block's d, qs, scales_h and scales_l fields
12431
+ scales, weight, L, kvalues_iq4nl, qw);
12432
+ }
12433
+ src += n_per_row; // advance input and output cursors to the next row
12434
+ qrow += nblock*sizeof(block_iq4_xs);
12435
+ }
12436
+ return nrow * nblock * sizeof(block_iq4_xs); // total size of the quantized output in bytes
12437
+ #endif
12438
+ }
12439
+
12440
+ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) { // Quantizes k floats from x into the block_iq4_xs buffer vy; thin wrapper over the reference routine.
12441
+ assert(k % QK_K == 0); // k must be a whole number of QK_K-sized blocks
12442
+ block_iq4_xs * restrict y = vy; // give the untyped output pointer its block type
12443
+ quantize_row_iq4_xs_reference(x, y, k);
12444
+ }
12445
+
12446
+ void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) { // Quantizes a single row of k floats from x into y.
12447
+ assert(k % QK_K == 0); // k must be a whole number of QK_K-sized blocks
12448
+ quantize_iq4_xs(x, y, 1, k, NULL, NULL); // one row, no histogram, no quantization weights
12449
+ }
12450
+
12451
+ // =============================== 2.5625 bpw
12452
+
12453
+ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { // Quantizes n floats from x into n/QK_K block_iq2_s records at vy; quant_weights may be NULL.
12454
+
12455
+ const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
12456
+
12457
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; // grid table, handed to the neighbour search
12458
+ const int * kmap_q2xs = iq2_data[gindex].map; // packed-levels key -> grid index; a negative entry means the key is not on the grid
12459
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; // neighbour lists consulted for off-grid keys
12460
+
12461
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?"); // all three tables must be non-NULL before quantizing
12462
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
12463
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
12464
+ GGML_ASSERT(n%QK_K == 0); // the row must consist of whole QK_K-sized blocks
12465
+
12466
+ const int kMaxQ = 3; // levels are clamped to 0..kMaxQ-1 below
12467
+
12468
+ const int nbl = n/QK_K; // number of QK_K-sized blocks in the row
12469
+
12470
+ block_iq2_s * y = vy;
12471
+
12472
+ float scales[QK_K/16]; // one fitted scale per 16-value sub-block
12473
+ float weight[16]; // per-value importance weights of the current sub-block
12474
+ float xval[16]; // magnitudes |x| of the current sub-block
12475
+ int8_t L[16]; // best levels found so far
12476
+ int8_t Laux[16]; // levels of the candidate currently being evaluated
12477
+ float waux[16]; // sqrt(weight), passed to the neighbour search
12478
+ bool is_on_grid[2]; // per 8-value group: did the best candidate's key map directly to a grid point?
12479
+ bool is_on_grid_aux[2]; // same flag for the candidate being evaluated
12480
+ uint8_t block_signs[2]; // per 8-value group: bit i set when value i is negative
12481
+
12482
+ for (int ibl = 0; ibl < nbl; ++ibl) {
12483
+
12484
+ memset(&y[ibl], 0, sizeof(block_iq2_s)); // start from an all-zero block; qh and scales are OR-ed into further down
12485
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
12486
+
12487
+ float max_scale = 0; // largest sub-block scale seen in this block
12488
+
12489
+ const float * xbl = x + QK_K*ibl;
12490
+ float sumx2 = 0;
12491
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
12492
+ float sigma2 = 2*sumx2/QK_K; // twice the mean square of the block's values
12493
+
12494
+ for (int ib = 0; ib < QK_K/16; ++ib) {
12495
+ const float * xb = xbl + 16*ib;
12496
+ if (quant_weights) {
12497
+ const float * qw = quant_weights + QK_K*ibl + 16*ib;
12498
+ for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); // caller-supplied weight scaled by sqrt(sigma2 + x^2)
12499
+ } else {
12500
+ for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i]; // fallback weight when no quant_weights are given
12501
+ }
12502
+ for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
12503
+ for (int k = 0; k < 2; ++k) { // split each 8-value group into magnitudes and a sign bitmask
12504
+ uint8_t s = 0;
12505
+ for (int i = 0; i < 8; ++i) {
12506
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
12507
+ else {
12508
+ xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
12509
+ }
12510
+ }
12511
+ block_signs[k] = s;
12512
+ }
12513
+ float max = xval[0];
12514
+ for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
12515
+ if (!max) { // all-zero sub-block: record a zero scale; its output fields keep the zeros from the memset
12516
+ scales[ib] = 0;
12517
+ continue;
12518
+ }
12519
+ float best = 0; // best value of sumqx^2/sumq2 found so far
12520
+ float scale = max/(2*kMaxQ-1); // initial scale, used if no candidate below is accepted
12521
+ is_on_grid[0] = is_on_grid[1] = true;
12522
+ for (int is = -9; is <= 9; ++is) { // try 19 candidate inverse scales around (2*kMaxQ-1)/max
12523
+ float id = (2*kMaxQ-1+is*0.1f)/max;
12524
+ float this_scale = 1/id;
12525
+ for (int k = 0; k < 2; ++k) {
12526
+ for (int i = 0; i < 8; ++i) {
12527
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); // level l stands for the quantized value 2*l+1 (see q below)
12528
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
12529
+ }
12530
+ uint16_t u = 0;
12531
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i); // pack eight 2-bit levels into the map key
12532
+ int grid_index = kmap_q2xs[u];
12533
+ is_on_grid_aux[k] = true;
12534
+ if (grid_index < 0) {
12535
+ is_on_grid_aux[k] = false;
12536
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; // the negative map entry encodes the offset of this key's neighbour list
12537
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k); // NOTE(review): presumably overwrites Laux + 8*k with the chosen grid point's levels - confirm in the helper
12538
+ }
12539
+ }
12540
+ float sumqx = 0, sumq2 = 0; // weighted sums for the least-squares scale sumqx/sumq2
12541
+ for (int i = 0; i < 16; ++i) {
12542
+ float w = weight[i];
12543
+ float q = 2*Laux[i] + 1;
12544
+ sumqx += w*xval[i]*q;
12545
+ sumq2 += w*q*q;
12546
+ }
12547
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { // accept the candidate when sumqx^2/sumq2 improves on best
12548
+ scale = sumqx/sumq2; best = scale*sumqx;
12549
+ for (int i = 0; i < 16; ++i) L[i] = Laux[i];
12550
+ for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
12551
+ }
12552
+ }
12553
+ int n_not_ongrid = 0;
12554
+ for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
12555
+ if (n_not_ongrid > 0 && scale > 0) { // re-quantize the off-grid groups with the fitted scale, then refit the scale
12556
+ float id = 1/scale;
12557
+ for (int k = 0; k < 2; ++k) {
12558
+ if (is_on_grid[k]) continue;
12559
+ uint16_t u = 0;
12560
+ for (int i = 0; i < 8; ++i) {
12561
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
12562
+ l = MAX(0, MIN(kMaxQ-1, l));
12563
+ u |= (l << 2*i);
12564
+ L[8*k + i] = l;
12565
+ }
12566
+ int grid_index = kmap_q2xs[u];
12567
+ if (grid_index < 0) {
12568
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12569
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k); // NOTE(review): the returned index is not used here; presumably the call matters for its update of L + 8*k - confirm
12570
+ }
12571
+ }
12572
+ float sumqx = 0, sumq2 = 0;
12573
+ for (int i = 0; i < 16; ++i) {
12574
+ float w = weight[i];
12575
+ float q = 2*L[i] + 1;
12576
+ sumqx += w*xval[i]*q;
12577
+ sumq2 += w*q*q;
12578
+ }
12579
+ if (sumq2 > 0) scale = sumqx/sumq2;
12580
+ }
12581
+ if (scale < 0) { // keep the stored scale non-negative by inverting the sign masks instead
12582
+ scale = -scale;
12583
+ for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
12584
+ }
12585
+ for (int k = 0; k < 2; ++k) { // look up the final grid index of each group and store it
12586
+ uint16_t u = 0;
12587
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
12588
+ int grid_index = kmap_q2xs[u];
12589
+ if (grid_index < 0) { // an off-grid key at this point is treated as a fatal internal error
12590
+ printf("Oops: found point %u not on grid:", u);
12591
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
12592
+ printf("\n");
12593
+ GGML_ASSERT(false);
12594
+ }
12595
+ const int i8 = 2*ib + k; // index of this 8-value group within the block
12596
+ y[ibl].qs[i8] = grid_index & 255; // low 8 bits of the grid index
12597
+ y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4)); // bits above the low 8, four groups per qh element at 2-bit strides
12598
+ y[ibl].qs[QK_K/8 + i8] = block_signs[k]; // sign masks are stored in qs after the first QK_K/8 entries
12599
+ }
12600
+ GGML_ASSERT(scale >= 0);
12601
+ scales[ib] = scale;
12602
+ max_scale = MAX(max_scale, scale);
12603
+ }
12604
+
12605
+ if (!max_scale) { // every sub-block scale is zero: leave the block as zeroed above
12606
+ continue;
12607
+ }
12608
+
12609
+ float d = max_scale/31;
12610
+ y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f); // the stored block scale is d multiplied by 0.9875
12611
+ float id = 1/d;
12612
+ for (int ib = 0; ib < QK_K/16; ++ib) {
12613
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
12614
+ l = MAX(0, MIN(15, l)); // clamp to a 4-bit scale code
12615
+ if (ib%2 == 0) y[ibl].scales[ib/2] = l; // even sub-block -> low nibble
12616
+ else y[ibl].scales[ib/2] |= (l << 4); // odd sub-block -> high nibble
12617
+ }
12618
+ }
12619
+ }
12620
+
12621
+ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { // Quantizes nrow rows of n_per_row floats from src into block_iq2_s records at dst and returns the output size in bytes.
12622
+ (void)hist; // hist is not used
12623
+ GGML_ASSERT(n_per_row%QK_K == 0); // a row must consist of whole QK_K-sized blocks
12624
+ int nblock = n_per_row/QK_K; // blocks per row
12625
+ char * qrow = (char *)dst; // byte cursor to the start of the current output row
12626
+ for (int row = 0; row < nrow; ++row) {
12627
+ quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights); // the same quant_weights pointer is passed for every row
12628
+ src += n_per_row;
12629
+ qrow += nblock*sizeof(block_iq2_s);
12630
+ }
12631
+ return nrow * nblock * sizeof(block_iq2_s);
12632
+ }
12633
+
12634
+ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) { // Quantizes a single row of k floats from x into y.
12635
+ assert(k % QK_K == 0); // k must be a whole number of QK_K-sized blocks
12636
+ quantize_iq2_s(x, y, 1, k, NULL, NULL); // one row, no histogram, no quantization weights
12637
+ }
12638
+
12639
+ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) { // Quantizes k floats from x into the block_iq2_s buffer vy; thin wrapper over the reference routine.
12640
+ assert(k % QK_K == 0); // k must be a whole number of QK_K-sized blocks
12641
+ block_iq2_s * restrict y = vy; // give the untyped output pointer its block type
12642
+ quantize_row_iq2_s_reference(x, y, k);
12643
+ }