llama_cpp 0.12.6 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -438,6 +438,54 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
438
438
  return res;
439
439
  }
440
440
 
441
+ // NOTE: not tested
442
+ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
443
+ int8x16_t res;
444
+
445
+ res[ 0] = a[b[ 0]];
446
+ res[ 1] = a[b[ 1]];
447
+ res[ 2] = a[b[ 2]];
448
+ res[ 3] = a[b[ 3]];
449
+ res[ 4] = a[b[ 4]];
450
+ res[ 5] = a[b[ 5]];
451
+ res[ 6] = a[b[ 6]];
452
+ res[ 7] = a[b[ 7]];
453
+ res[ 8] = a[b[ 8]];
454
+ res[ 9] = a[b[ 9]];
455
+ res[10] = a[b[10]];
456
+ res[11] = a[b[11]];
457
+ res[12] = a[b[12]];
458
+ res[13] = a[b[13]];
459
+ res[14] = a[b[14]];
460
+ res[15] = a[b[15]];
461
+
462
+ return res;
463
+ }
464
+
465
+ // NOTE: not tested
466
+ inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
467
+ int8x16_t res;
468
+
469
+ res[ 0] = a[b[ 0]];
470
+ res[ 1] = a[b[ 1]];
471
+ res[ 2] = a[b[ 2]];
472
+ res[ 3] = a[b[ 3]];
473
+ res[ 4] = a[b[ 4]];
474
+ res[ 5] = a[b[ 5]];
475
+ res[ 6] = a[b[ 6]];
476
+ res[ 7] = a[b[ 7]];
477
+ res[ 8] = a[b[ 8]];
478
+ res[ 9] = a[b[ 9]];
479
+ res[10] = a[b[10]];
480
+ res[11] = a[b[11]];
481
+ res[12] = a[b[12]];
482
+ res[13] = a[b[13]];
483
+ res[14] = a[b[14]];
484
+ res[15] = a[b[15]];
485
+
486
+ return res;
487
+ }
488
+
441
489
  #else
442
490
 
443
491
  #define ggml_int16x8x2_t int16x8x2_t
@@ -451,6 +499,8 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
451
499
  #define ggml_vld1q_u8_x4 vld1q_u8_x4
452
500
  #define ggml_vld1q_s8_x2 vld1q_s8_x2
453
501
  #define ggml_vld1q_s8_x4 vld1q_s8_x4
502
+ #define ggml_vqtbl1q_s8 vqtbl1q_s8
503
+ #define ggml_vqtbl1q_u8 vqtbl1q_u8
454
504
 
455
505
  #endif
456
506
 
@@ -1827,7 +1877,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1827
1877
  float mins[QK_K/16];
1828
1878
  float scales[QK_K/16];
1829
1879
  float sw[QK_K/16];
1830
- float weight[QK_K/16];
1880
+ float weight[16];
1831
1881
  uint8_t Ls[QK_K/16], Lm[QK_K/16];
1832
1882
 
1833
1883
  for (int i = 0; i < nb; i++) {
@@ -1838,12 +1888,41 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1838
1888
  for (int j = 0; j < QK_K/16; ++j) {
1839
1889
  const float * restrict qw = quant_weights + QK_K * i + 16*j;
1840
1890
  for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
1841
- for (int l = 0; l < 16; ++l) sw[j] += weight[l];
1891
+ for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
1842
1892
  scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
1843
1893
  }
1844
1894
 
1845
- float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
1846
- float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
1895
+ float dm, mm;
1896
+ #if QK_K == 64
1897
+ float max_scale = 0, max_min = 0;
1898
+ for (int j = 0; j < QK_K/16; ++j) {
1899
+ max_scale = MAX(max_scale, scales[j]);
1900
+ max_min = MAX(max_min, mins[j]);
1901
+ }
1902
+ dm = max_scale/15;
1903
+ mm = max_min/15;
1904
+ if (max_scale) {
1905
+ float id = 1/dm;
1906
+ for (int j = 0; j < QK_K/16; ++j) {
1907
+ int l = nearest_int(id*scales[j]);
1908
+ Ls[j] = MAX(0, MIN(15, l));
1909
+ }
1910
+ } else {
1911
+ memset(Ls, 0, QK_K/16);
1912
+ }
1913
+ if (max_min) {
1914
+ float id = 1/mm;
1915
+ for (int j = 0; j < QK_K/16; ++j) {
1916
+ int l = nearest_int(id*mins[j]);
1917
+ Lm[j] = MAX(0, MIN(15, l));
1918
+ }
1919
+ } else {
1920
+ memset(Lm, 0, QK_K/16);
1921
+ }
1922
+ #else
1923
+ dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
1924
+ mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
1925
+ #endif
1847
1926
  y[i].d = GGML_FP32_TO_FP16(dm);
1848
1927
  y[i].dmin = GGML_FP32_TO_FP16(mm);
1849
1928
  dm = GGML_FP16_TO_FP32(y[i].d);
@@ -3445,6 +3524,265 @@ static const uint64_t iq2xs_grid[512] = {
3445
3524
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
3446
3525
  };
3447
3526
 
3527
+ static const uint64_t iq2s_grid[1024] = {
3528
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3529
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
3530
+ 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
3531
+ 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
3532
+ 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
3533
+ 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
3534
+ 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
3535
+ 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
3536
+ 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
3537
+ 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
3538
+ 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
3539
+ 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
3540
+ 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
3541
+ 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
3542
+ 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
3543
+ 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
3544
+ 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
3545
+ 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
3546
+ 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
3547
+ 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
3548
+ 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
3549
+ 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
3550
+ 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
3551
+ 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
3552
+ 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
3553
+ 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
3554
+ 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
3555
+ 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
3556
+ 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
3557
+ 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
3558
+ 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
3559
+ 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
3560
+ 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
3561
+ 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
3562
+ 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
3563
+ 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
3564
+ 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
3565
+ 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
3566
+ 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
3567
+ 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
3568
+ 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
3569
+ 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
3570
+ 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
3571
+ 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
3572
+ 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
3573
+ 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
3574
+ 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
3575
+ 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
3576
+ 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
3577
+ 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
3578
+ 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
3579
+ 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
3580
+ 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
3581
+ 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
3582
+ 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
3583
+ 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
3584
+ 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
3585
+ 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
3586
+ 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
3587
+ 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
3588
+ 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
3589
+ 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
3590
+ 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
3591
+ 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
3592
+ 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
3593
+ 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
3594
+ 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
3595
+ 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
3596
+ 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
3597
+ 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
3598
+ 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
3599
+ 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
3600
+ 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
3601
+ 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
3602
+ 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
3603
+ 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
3604
+ 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
3605
+ 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
3606
+ 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
3607
+ 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
3608
+ 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
3609
+ 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
3610
+ 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
3611
+ 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
3612
+ 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
3613
+ 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
3614
+ 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
3615
+ 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
3616
+ 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
3617
+ 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
3618
+ 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
3619
+ 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
3620
+ 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
3621
+ 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
3622
+ 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
3623
+ 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
3624
+ 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
3625
+ 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
3626
+ 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
3627
+ 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
3628
+ 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
3629
+ 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
3630
+ 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
3631
+ 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
3632
+ 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
3633
+ 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
3634
+ 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
3635
+ 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
3636
+ 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
3637
+ 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
3638
+ 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
3639
+ 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
3640
+ 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
3641
+ 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
3642
+ 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
3643
+ 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
3644
+ 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
3645
+ 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
3646
+ 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
3647
+ 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
3648
+ 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
3649
+ 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
3650
+ 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
3651
+ 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
3652
+ 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
3653
+ 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
3654
+ 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
3655
+ 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
3656
+ 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
3657
+ 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
3658
+ 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
3659
+ 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
3660
+ 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
3661
+ 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
3662
+ 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
3663
+ 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
3664
+ 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
3665
+ 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
3666
+ 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
3667
+ 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
3668
+ 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
3669
+ 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
3670
+ 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
3671
+ 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
3672
+ 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
3673
+ 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
3674
+ 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
3675
+ 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
3676
+ 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
3677
+ 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
3678
+ 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
3679
+ 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
3680
+ 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
3681
+ 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
3682
+ 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
3683
+ 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
3684
+ 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
3685
+ 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
3686
+ 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
3687
+ 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
3688
+ 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
3689
+ 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
3690
+ 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
3691
+ 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
3692
+ 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
3693
+ 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
3694
+ 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
3695
+ 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
3696
+ 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
3697
+ 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
3698
+ 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
3699
+ 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
3700
+ 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
3701
+ 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
3702
+ 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
3703
+ 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
3704
+ 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
3705
+ 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
3706
+ 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
3707
+ 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
3708
+ 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
3709
+ 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
3710
+ 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
3711
+ 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
3712
+ 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
3713
+ 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
3714
+ 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
3715
+ 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
3716
+ 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
3717
+ 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
3718
+ 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
3719
+ 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
3720
+ 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
3721
+ 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
3722
+ 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
3723
+ 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
3724
+ 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
3725
+ 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
3726
+ 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
3727
+ 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
3728
+ 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
3729
+ 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
3730
+ 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
3731
+ 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
3732
+ 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
3733
+ 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
3734
+ 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
3735
+ 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
3736
+ 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
3737
+ 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
3738
+ 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
3739
+ 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
3740
+ 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
3741
+ 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
3742
+ 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
3743
+ 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
3744
+ 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
3745
+ 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
3746
+ 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
3747
+ 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
3748
+ 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
3749
+ 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
3750
+ 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
3751
+ 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
3752
+ 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
3753
+ 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
3754
+ 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
3755
+ 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
3756
+ 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
3757
+ 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
3758
+ 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
3759
+ 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
3760
+ 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
3761
+ 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
3762
+ 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
3763
+ 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
3764
+ 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
3765
+ 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
3766
+ 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
3767
+ 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
3768
+ 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
3769
+ 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
3770
+ 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
3771
+ 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
3772
+ 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
3773
+ 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
3774
+ 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
3775
+ 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
3776
+ 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
3777
+ 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
3778
+ 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
3779
+ 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
3780
+ 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
3781
+ 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
3782
+ 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
3783
+ 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
3784
+ };
3785
+
3448
3786
  static const uint32_t iq3xxs_grid[256] = {
3449
3787
  0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
3450
3788
  0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
@@ -3480,6 +3818,206 @@ static const uint32_t iq3xxs_grid[256] = {
3480
3818
  0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
3481
3819
  };
3482
3820
 
3821
+ static const uint32_t iq3xs_grid[512] = {
3822
+ 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
3823
+ 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
3824
+ 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
3825
+ 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
3826
+ 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
3827
+ 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
3828
+ 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
3829
+ 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
3830
+ 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
3831
+ 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
3832
+ 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
3833
+ 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
3834
+ 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
3835
+ 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
3836
+ 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
3837
+ 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
3838
+ 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
3839
+ 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
3840
+ 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
3841
+ 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
3842
+ 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
3843
+ 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
3844
+ 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
3845
+ 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
3846
+ 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
3847
+ 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
3848
+ 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
3849
+ 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
3850
+ 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
3851
+ 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
3852
+ 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
3853
+ 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
3854
+ 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
3855
+ 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
3856
+ 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
3857
+ 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
3858
+ 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
3859
+ 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
3860
+ 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
3861
+ 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
3862
+ 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
3863
+ 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
3864
+ 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
3865
+ 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
3866
+ 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
3867
+ 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
3868
+ 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
3869
+ 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
3870
+ 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
3871
+ 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
3872
+ 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
3873
+ 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
3874
+ 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
3875
+ 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
3876
+ 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
3877
+ 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
3878
+ 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
3879
+ 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
3880
+ 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
3881
+ 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
3882
+ 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
3883
+ 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
3884
+ 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
3885
+ 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
3886
+ };
3887
+
3888
+ #define NGRID_IQ2XXS 512
3889
+ static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
3890
+ 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
3891
+ 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
3892
+ 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
3893
+ 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
3894
+ 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
3895
+ 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
3896
+ 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
3897
+ 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
3898
+ 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
3899
+ 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
3900
+ 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
3901
+ 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
3902
+ 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
3903
+ 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
3904
+ 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
3905
+ 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
3906
+ 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
3907
+ 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
3908
+ 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
3909
+ 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
3910
+ 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
3911
+ 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
3912
+ 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
3913
+ 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
3914
+ 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
3915
+ 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
3916
+ 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
3917
+ 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
3918
+ 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
3919
+ 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
3920
+ 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
3921
+ 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
3922
+ 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
3923
+ 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
3924
+ 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
3925
+ 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
3926
+ 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
3927
+ 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
3928
+ 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
3929
+ 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
3930
+ 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
3931
+ 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
3932
+ 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
3933
+ 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
3934
+ 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
3935
+ 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
3936
+ 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
3937
+ 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
3938
+ 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
3939
+ 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
3940
+ 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
3941
+ 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
3942
+ 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
3943
+ 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
3944
+ 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
3945
+ 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
3946
+ 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
3947
+ 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
3948
+ 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
3949
+ 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
3950
+ 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
3951
+ 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
3952
+ 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
3953
+ 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
3954
+ 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
3955
+ 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
3956
+ 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
3957
+ 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
3958
+ 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
3959
+ 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
3960
+ 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
3961
+ 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
3962
+ 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
3963
+ 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
3964
+ 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
3965
+ 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
3966
+ 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
3967
+ 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
3968
+ 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
3969
+ 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
3970
+ 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
3971
+ 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
3972
+ 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
3973
+ 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
3974
+ 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
3975
+ 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
3976
+ 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
3977
+ 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
3978
+ 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
3979
+ 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
3980
+ 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
3981
+ 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
3982
+ 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
3983
+ 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
3984
+ 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
3985
+ 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
3986
+ 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
3987
+ 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
3988
+ 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
3989
+ 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
3990
+ 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
3991
+ 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
3992
+ 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
3993
+ 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
3994
+ 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
3995
+ 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
3996
+ 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
3997
+ 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
3998
+ 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
3999
+ 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
4000
+ 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
4001
+ 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
4002
+ 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
4003
+ 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
4004
+ 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
4005
+ 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
4006
+ 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
4007
+ 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
4008
+ 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
4009
+ 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
4010
+ 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
4011
+ 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
4012
+ 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
4013
+ 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
4014
+ 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
4015
+ 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
4016
+ 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
4017
+ 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
4018
+
4019
+ };
4020
+
3483
4021
  static const uint8_t ksigns_iq2xs[128] = {
3484
4022
  0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
3485
4023
  144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
@@ -3546,6 +4084,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
3546
4084
  }
3547
4085
  }
3548
4086
 
4087
+ // ====================== 2.5625 bpw (de)-quantization
4088
+
4089
+ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
4090
+ assert(k % QK_K == 0);
4091
+ const int nb = k / QK_K;
4092
+
4093
+ float db[2];
4094
+
4095
+ for (int i = 0; i < nb; i++) {
4096
+
4097
+ const float d = GGML_FP16_TO_FP32(x[i].d);
4098
+ const uint8_t * qs = x[i].qs;
4099
+ const uint8_t * qh = x[i].qh;
4100
+ const uint8_t * signs = qs + QK_K/8;
4101
+
4102
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
4103
+ db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
4104
+ db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
4105
+ for (int l = 0; l < 4; ++l) {
4106
+ const float dl = db[l/2];
4107
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
4108
+ for (int j = 0; j < 8; ++j) {
4109
+ y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
4110
+ }
4111
+ y += 8;
4112
+ }
4113
+ qs += 4;
4114
+ signs += 4;
4115
+ }
4116
+ }
4117
+ }
4118
+
3549
4119
  // ====================== 3.0625 bpw (de)-quantization
3550
4120
 
3551
4121
  void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
@@ -3578,6 +4148,139 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
3578
4148
  }
3579
4149
  }
3580
4150
 
4151
+ // ====================== 3.3125 bpw (de)-quantization
4152
+
4153
+ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
4154
+ assert(k % QK_K == 0);
4155
+ const int nb = k / QK_K;
4156
+
4157
+ for (int i = 0; i < nb; i++) {
4158
+
4159
+ const float d = GGML_FP16_TO_FP32(x[i].d);
4160
+ const uint8_t * qs = x[i].qs;
4161
+ const uint8_t * qh = x[i].qh;
4162
+ const uint8_t * signs = x[i].signs;
4163
+
4164
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
4165
+ const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f;
4166
+ const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f;
4167
+ for (int l = 0; l < 4; ++l) {
4168
+ const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
4169
+ const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
4170
+ for (int j = 0; j < 4; ++j) {
4171
+ y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
4172
+ y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
4173
+ }
4174
+ y += 8;
4175
+ }
4176
+ qs += 8;
4177
+ signs += 4;
4178
+ for (int l = 0; l < 4; ++l) {
4179
+ const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
4180
+ const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
4181
+ for (int j = 0; j < 4; ++j) {
4182
+ y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
4183
+ y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
4184
+ }
4185
+ y += 8;
4186
+ }
4187
+ qh += 2;
4188
+ qs += 8;
4189
+ signs += 4;
4190
+ }
4191
+ }
4192
+ }
4193
+
4194
+ // ====================== 1.5625 bpw (de)-quantization
4195
+
4196
+ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
4197
+ assert(k % QK_K == 0);
4198
+ const int nb = k / QK_K;
4199
+
4200
+ float db[4];
4201
+ uint16_t idx[4];
4202
+ //const int8_t * grid[4];
4203
+
4204
+ for (int i = 0; i < nb; i++) {
4205
+
4206
+ const float d = GGML_FP16_TO_FP32(x[i].d);
4207
+ const uint8_t * sc = x[i].scales;
4208
+ const uint8_t * qs = x[i].qs;
4209
+
4210
+ for (int i8 = 0; i8 < QK_K/8; i8 += 4) {
4211
+ idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
4212
+ idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
4213
+ idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
4214
+ idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
4215
+ //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
4216
+ //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
4217
+ //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
4218
+ //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
4219
+ db[0] = d * (2*(sc[0] & 7) + 1);
4220
+ db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
4221
+ db[2] = d * (2*(sc[1] & 7) + 1);
4222
+ db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
4223
+ for (int l = 0; l < 4; ++l) {
4224
+ const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
4225
+ for (int j = 0; j < 8; ++j) {
4226
+ //y[j] = db[l] * grid[l][j];
4227
+ y[j] = db[l] * grid[j];
4228
+ }
4229
+ y += 8;
4230
+ }
4231
+ qs += 4;
4232
+ sc += 2;
4233
+ }
4234
+ }
4235
+ }
4236
+
4237
// Lookup table turning a 4-bit IQ4_NL / IQ4_XS code (0..15) into a signed
// integer level. The levels are not evenly spaced; the dequantizers multiply
// the selected level by a per-block (or per-sub-block) float scale.
static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10,
       1,   13,  25,  38,  53,  69,  89, 113,
};
4238
+
4239
+ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
4240
+ assert(k % QK4_NL == 0);
4241
+ const int nb = k / QK4_NL;
4242
+
4243
+ for (int i = 0; i < nb; i++) {
4244
+
4245
+ const uint8_t * qs = x[i].qs;
4246
+
4247
+ const float d = GGML_FP16_TO_FP32(x[i].d);
4248
+ for (int j = 0; j < QK4_NL/2; ++j) {
4249
+ y[j+ 0] = d * kvalues_iq4nl[qs[j] & 0xf];
4250
+ y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >> 4];
4251
+ }
4252
+ y += QK4_NL;
4253
+ qs += QK4_NL/2;
4254
+ }
4255
+ }
4256
+
4257
+ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
4258
+ assert(k % QK_K == 0);
4259
+ #if QK_K == 64
4260
+ dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
4261
+ #else
4262
+ const int nb = k / QK_K;
4263
+
4264
+ for (int i = 0; i < nb; i++) {
4265
+
4266
+ const uint8_t * qs = x[i].qs;
4267
+
4268
+ const float d = GGML_FP16_TO_FP32(x[i].d);
4269
+
4270
+ for (int ib = 0; ib < QK_K/32; ++ib) {
4271
+ const int ls = ((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4);
4272
+ const float dl = d * (ls - 32);
4273
+ for (int j = 0; j < 16; ++j) {
4274
+ y[j+ 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
4275
+ y[j+16] = dl * kvalues_iq4nl[qs[j] >> 4];
4276
+ }
4277
+ y += 32;
4278
+ qs += 16;
4279
+ }
4280
+ }
4281
+ #endif
4282
+ }
4283
+
3581
4284
  //===================================== Q8_K ==============================================
3582
4285
 
3583
4286
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -3848,15 +4551,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3848
4551
 
3849
4552
  const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
3850
4553
 
3851
- __m128i bx = _mm_and_si128(lowMask, tmp);
3852
- __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
3853
- bx = _mm_sub_epi8(bx, off);
3854
- const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
4554
+ __m128i bx_0 = _mm_and_si128(lowMask, tmp);
4555
+ __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
4556
+ bx_0 = _mm_sub_epi8(bx_0, off);
4557
+ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
3855
4558
 
3856
- bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
3857
- by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
3858
- bx = _mm_sub_epi8(bx, off);
3859
- const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
4559
+ bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
4560
+ by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
4561
+ bx_0 = _mm_sub_epi8(bx_0, off);
4562
+ const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
3860
4563
 
3861
4564
  // Convert int32_t to float
3862
4565
  __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
@@ -4442,21 +5145,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
4442
5145
  /* Compute combined scale for the block */
4443
5146
  const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
4444
5147
 
4445
- __m256i bx = bytes_from_nibbles_32(x[i].qs);
5148
+ __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
4446
5149
  const __m256i bxhi = bytes_from_bits_32(x[i].qh);
4447
5150
  __m128i bxhil = _mm256_castsi256_si128(bxhi);
4448
5151
  __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
4449
5152
  bxhil = _mm_andnot_si128(bxhil, mask);
4450
5153
  bxhih = _mm_andnot_si128(bxhih, mask);
4451
- __m128i bxl = _mm256_castsi256_si128(bx);
4452
- __m128i bxh = _mm256_extractf128_si256(bx, 1);
5154
+ __m128i bxl = _mm256_castsi256_si128(bx_0);
5155
+ __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
4453
5156
  bxl = _mm_or_si128(bxl, bxhil);
4454
5157
  bxh = _mm_or_si128(bxh, bxhih);
4455
- bx = MM256_SET_M128I(bxh, bxl);
5158
+ bx_0 = MM256_SET_M128I(bxh, bxl);
4456
5159
 
4457
- const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
5160
+ const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
4458
5161
 
4459
- const __m256 q = mul_sum_i8_pairs_float(bx, by);
5162
+ const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
4460
5163
 
4461
5164
  /* Multiply q with scale and accumulate */
4462
5165
  acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
@@ -4749,22 +5452,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4749
5452
 
4750
5453
  summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
4751
5454
 
4752
- __m256i bx = bytes_from_nibbles_32(x[i].qs);
5455
+ __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
4753
5456
  const __m256i bxhi = bytes_from_bits_32(x[i].qh);
4754
5457
  __m128i bxhil = _mm256_castsi256_si128(bxhi);
4755
5458
  __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
4756
5459
  bxhil = _mm_and_si128(bxhil, mask);
4757
5460
  bxhih = _mm_and_si128(bxhih, mask);
4758
- __m128i bxl = _mm256_castsi256_si128(bx);
4759
- __m128i bxh = _mm256_extractf128_si256(bx, 1);
5461
+ __m128i bxl = _mm256_castsi256_si128(bx_0);
5462
+ __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
4760
5463
  bxl = _mm_or_si128(bxl, bxhil);
4761
5464
  bxh = _mm_or_si128(bxh, bxhih);
4762
- bx = MM256_SET_M128I(bxh, bxl);
5465
+ bx_0 = MM256_SET_M128I(bxh, bxl);
4763
5466
 
4764
5467
  const __m256 dy = _mm256_set1_ps(y[i].d);
4765
- const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
5468
+ const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
4766
5469
 
4767
- const __m256 q = mul_sum_us8_pairs_float(bx, by);
5470
+ const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
4768
5471
 
4769
5472
  acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
4770
5473
  }
@@ -4993,10 +5696,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
4993
5696
 
4994
5697
  for (int i = 0; i < nb; i++) {
4995
5698
  // load elements
4996
- vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
4997
- vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
5699
+ vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
5700
+ vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
4998
5701
 
4999
- vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
5702
+ vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
5000
5703
 
5001
5704
  vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
5002
5705
  vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
@@ -5433,8 +6136,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5433
6136
 
5434
6137
  for (int i = 0; i < nb; ++i) {
5435
6138
 
5436
- const float d = y[i].d * (float)x[i].d;
5437
- const float dmin = -y[i].d * (float)x[i].dmin;
6139
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6140
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
5438
6141
 
5439
6142
  const uint8_t * restrict q2 = x[i].qs;
5440
6143
  const int8_t * restrict q8 = y[i].qs;
@@ -5583,8 +6286,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5583
6286
 
5584
6287
  for (int i = 0; i < nb; ++i) {
5585
6288
 
5586
- const float d = y[i].d * (float)x[i].d;
5587
- const float dmin = -y[i].d * (float)x[i].dmin;
6289
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6290
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
5588
6291
 
5589
6292
  const uint8_t * restrict q2 = x[i].qs;
5590
6293
  const int8_t * restrict q8 = y[i].qs;
@@ -5636,7 +6339,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5636
6339
 
5637
6340
  float sumf = 0;
5638
6341
 
5639
- int isum[4];
6342
+ int isum[QK_K/16];
5640
6343
 
5641
6344
  for (int i = 0; i < nb; ++i) {
5642
6345
 
@@ -5652,14 +6355,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5652
6355
  const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5653
6356
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
5654
6357
 
5655
- isum[0] = isum[1] = isum[2] = isum[3] = 0;
6358
+ memset(isum, 0, (QK_K/16)*sizeof(int));
5656
6359
  for (int l = 0; l < 16; ++l) {
5657
6360
  isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
5658
6361
  isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
5659
6362
  isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
5660
6363
  isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
5661
6364
  }
5662
- for (int l = 0; l < 4; ++l) {
6365
+ for (int l = 0; l < QK_K/16; ++l) {
5663
6366
  isum[l] *= (sc[l] & 0xF);
5664
6367
  }
5665
6368
  sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
@@ -6237,7 +6940,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6237
6940
 
6238
6941
  int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
6239
6942
 
6240
- const float d = y[i].d * (float)x[i].d;
6943
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6241
6944
 
6242
6945
  const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
6243
6946
  q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
@@ -6439,7 +7142,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6439
7142
 
6440
7143
  int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
6441
7144
 
6442
- const float d = y[i].d * (float)x[i].d;
7145
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6443
7146
 
6444
7147
  vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
6445
7148
 
@@ -6942,9 +7645,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6942
7645
  aux16[1] = (a[0] >> 4) & 0x0f0f;
6943
7646
 
6944
7647
  const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
6945
- sum_mins += y[i].d * (float)x[i].d[1] * summi;
7648
+ sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;
6946
7649
 
6947
- const float d = y[i].d * (float)x[i].d[0];
7650
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
6948
7651
 
6949
7652
  const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
6950
7653
 
@@ -7602,7 +8305,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7602
8305
 
7603
8306
  for (int i = 0; i < nb; ++i) {
7604
8307
 
7605
- const float d = y[i].d * (float)x[i].d;
8308
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7606
8309
  const int8_t * sc = x[i].scales;
7607
8310
 
7608
8311
  const uint8_t * restrict q5 = x[i].qs;
@@ -7744,7 +8447,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7744
8447
 
7745
8448
  for (int i = 0; i < nb; ++i) {
7746
8449
 
7747
- const float d = y[i].d * (float)x[i].d;
8450
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7748
8451
  const int8_t * sc = x[i].scales;
7749
8452
 
7750
8453
  const uint8_t * restrict q5 = x[i].qs;
@@ -8312,7 +9015,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8312
9015
 
8313
9016
  for (int i = 0; i < nb; ++i) {
8314
9017
 
8315
- const float d_all = (float)x[i].d;
9018
+ const float d_all = GGML_FP16_TO_FP32(x[i].d);
8316
9019
 
8317
9020
  const uint8_t * restrict q6 = x[i].ql;
8318
9021
  const uint8_t * restrict qh = x[i].qh;
@@ -8483,7 +9186,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8483
9186
 
8484
9187
  for (int i = 0; i < nb; ++i) {
8485
9188
 
8486
- const float d_all = (float)x[i].d;
9189
+ const float d_all = GGML_FP16_TO_FP32(x[i].d);
8487
9190
 
8488
9191
  const uint8_t * restrict q6 = x[i].ql;
8489
9192
  const uint8_t * restrict qh = x[i].qh;
@@ -8585,6 +9288,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8585
9288
 
8586
9289
  #endif
8587
9290
 
9291
+ #if defined (__AVX2__) || defined (__ARM_NEON)
8588
9292
  static const int8_t keven_signs_q2xs[1024] = {
8589
9293
  1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
8590
9294
  1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
@@ -8619,6 +9323,7 @@ static const int8_t keven_signs_q2xs[1024] = {
8619
9323
  1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
8620
9324
  1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
8621
9325
  };
9326
+ #endif
8622
9327
 
8623
9328
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
8624
9329
  assert(n % QK_K == 0);
@@ -8816,15 +9521,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8816
9521
 
8817
9522
  #elif defined(__AVX2__)
8818
9523
 
8819
- const __m128i m4 = _mm_set1_epi8(0xf);
8820
- const __m128i m1 = _mm_set1_epi8(1);
8821
- const __m256i m511 = _mm256_set1_epi16(511);
8822
9524
  const __m256i mone = _mm256_set1_epi8(1);
8823
-
8824
- static const uint8_t k_bit_helper[32] = {
8825
- 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
8826
- 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
8827
- };
8828
9525
  static const char block_sign_shuffle_mask_1[32] = {
8829
9526
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
8830
9527
  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
@@ -8838,11 +9535,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8838
9535
  0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
8839
9536
  };
8840
9537
 
8841
- const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
8842
9538
  const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
8843
9539
  const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
8844
9540
  const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
8845
9541
 
9542
+ #if QK_K == 64
9543
+ static const uint8_t k_bit_helper[16] = {
9544
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
9545
+ };
9546
+ const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
9547
+ const __m128i m511 = _mm_set1_epi16(511);
9548
+ typedef union {
9549
+ __m128i vec_index;
9550
+ uint16_t index[8];
9551
+ } index_t;
9552
+
9553
+ index_t idx;
9554
+ __m256 accumf = _mm256_setzero_ps();
9555
+ for (int i = 0; i < nb; ++i) {
9556
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9557
+ const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
9558
+ idx.vec_index = _mm_and_si128(q2_data, m511);
9559
+
9560
+ const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
9561
+ const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
9562
+ const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
9563
+
9564
+ const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
9565
+ const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
9566
+ const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
9567
+
9568
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
9569
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
9570
+
9571
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
9572
+ iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
9573
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
9574
+ iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
9575
+
9576
+ __m256i signs;
9577
+ signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
9578
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
9579
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
9580
+
9581
+ signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
9582
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
9583
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
9584
+
9585
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
9586
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
9587
+
9588
+ const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
9589
+ const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
9590
+
9591
+ const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
9592
+
9593
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
9594
+
9595
+ }
9596
+
9597
+ *s = 0.125f * hsum_float_8(accumf);
9598
+ #else
9599
+
9600
+ static const uint8_t k_bit_helper[32] = {
9601
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
9602
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
9603
+ };
9604
+ const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
9605
+ const __m256i m511 = _mm256_set1_epi16(511);
9606
+ const __m128i m4 = _mm_set1_epi8(0xf);
9607
+ const __m128i m1 = _mm_set1_epi8(1);
9608
+
8846
9609
  uint64_t aux64;
8847
9610
 
8848
9611
  // somewhat hacky, but gives a significant boost in performance
@@ -8931,6 +9694,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8931
9694
  }
8932
9695
 
8933
9696
  *s = 0.125f * hsum_float_8(accumf);
9697
+ #endif
8934
9698
 
8935
9699
  #else
8936
9700
 
@@ -8972,8 +9736,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8972
9736
  #endif
8973
9737
  }
8974
9738
 
8975
- // TODO
8976
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9739
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
8977
9740
  assert(n % QK_K == 0);
8978
9741
  assert(nrc == 1);
8979
9742
  UNUSED(nrc);
@@ -8981,75 +9744,279 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8981
9744
  UNUSED(by);
8982
9745
  UNUSED(bs);
8983
9746
 
8984
- const block_iq3_xxs * restrict x = vx;
8985
- const block_q8_K * restrict y = vy;
9747
+ const block_iq2_s * restrict x = vx;
9748
+ const block_q8_K * restrict y = vy;
8986
9749
 
8987
9750
  const int nb = n / QK_K;
8988
9751
 
8989
9752
  #if defined(__ARM_NEON)
8990
9753
 
8991
- const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9754
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
9755
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
9756
+ };
8992
9757
 
8993
- uint32_t aux32[2];
9758
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
8994
9759
 
8995
- ggml_int8x16x4_t q3s;
9760
+ const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
9761
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
9762
+ const uint8x16_t m1 = vdupq_n_u8(1);
9763
+ const int32x4_t vzero = vdupq_n_s32(0);
9764
+
9765
+ uint8x16x2_t vs;
9766
+ ggml_int8x16x4_t q2s;
8996
9767
  ggml_int8x16x4_t q8b;
8997
9768
 
8998
9769
  float sumf = 0;
8999
9770
  for (int i = 0; i < nb; ++i) {
9771
+
9000
9772
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9001
- const uint8_t * restrict q3 = x[i].qs;
9002
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9003
- const int8_t * restrict q8 = y[i].qs;
9004
- float sumf1 = 0, sumf2 = 0;
9773
+
9774
+ const uint8_t * restrict qs = x[i].qs;
9775
+ const uint8_t * restrict qh = x[i].qh;
9776
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9777
+ const int8_t * restrict q8 = y[i].qs;
9778
+
9779
+ int sumi1 = 0, sumi2 = 0;
9005
9780
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9006
9781
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9007
- memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
9008
- const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
9009
- const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
9010
- const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
9011
- const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
9012
- q3 += 16;
9013
- q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
9014
- q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
9015
- q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
9016
- q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
9017
- q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
9018
- q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
9019
- q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
9020
- q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
9021
- const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
9022
- const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
9023
- sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
9024
- sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
9782
+ q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
9783
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
9784
+ q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
9785
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
9786
+ q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
9787
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
9788
+ q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
9789
+ vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
9790
+ qs += 8;
9791
+
9792
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
9793
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9794
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9795
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
9796
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
9797
+
9798
+ q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
9799
+ q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
9800
+
9801
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
9802
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9803
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9804
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
9805
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
9806
+
9807
+ signs += 4;
9808
+
9809
+ q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
9810
+ q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
9811
+
9812
+ const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
9813
+ const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
9814
+ const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
9815
+ const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
9816
+
9817
+ sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
9818
+ sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
9819
+ sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
9820
+ sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
9025
9821
  }
9026
- sumf += d*(sumf1 + sumf2);
9822
+ sumf += d*(sumi1 + sumi2);
9027
9823
  }
9028
- *s = 0.5f * sumf;
9824
+
9825
+ *s = 0.125f * sumf;
9029
9826
 
9030
9827
  #elif defined(__AVX2__)
9031
9828
 
9032
- const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9829
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
9830
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
9831
+ };
9033
9832
 
9034
- uint32_t aux32[2];
9833
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
9834
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
9835
+ };
9836
+
9837
+ const __m128i m4 = _mm_set1_epi8(0xf);
9838
+ const __m128i m1 = _mm_set1_epi8(1);
9839
+
9840
+ const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
9841
+ const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
9842
+
9843
+ uint64_t aux64;
9035
9844
 
9036
9845
  __m256 accumf = _mm256_setzero_ps();
9037
9846
  for (int i = 0; i < nb; ++i) {
9038
9847
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9039
- const uint8_t * restrict q3 = x[i].qs;
9040
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9848
+ const uint8_t * restrict qs = x[i].qs;
9849
+ const uint8_t * restrict qh = x[i].qh;
9850
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9041
9851
  const int8_t * restrict q8 = y[i].qs;
9852
+
9853
+ memcpy(&aux64, x[i].scales, 8);
9854
+ const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
9855
+ const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
9856
+
9042
9857
  __m256i sumi1 = _mm256_setzero_si256();
9043
9858
  __m256i sumi2 = _mm256_setzero_si256();
9044
9859
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9045
9860
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
9046
9861
  const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
9047
- const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
9048
- iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
9049
- q3 += 8;
9050
- const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
9051
- iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
9052
- q3 += 8;
9862
+ const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
9863
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
9864
+ iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
9865
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
9866
+ const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
9867
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
9868
+ iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
9869
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
9870
+ qs += 8;
9871
+
9872
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
9873
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9874
+ const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
9875
+ const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
9876
+
9877
+ aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
9878
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9879
+ const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
9880
+ const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
9881
+
9882
+ signs += 4;
9883
+
9884
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
9885
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
9886
+
9887
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
9888
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
9889
+ sumi1 = _mm256_add_epi32(sumi1, p1);
9890
+ sumi2 = _mm256_add_epi32(sumi2, p2);
9891
+ }
9892
+
9893
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
9894
+
9895
+ }
9896
+
9897
+ *s = 0.125f * hsum_float_8(accumf);
9898
+
9899
+ #else
9900
+
9901
+ float sumf = 0;
9902
+ for (int i = 0; i < nb; i++) {
9903
+
9904
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9905
+ const int8_t * q8 = y[i].qs;
9906
+ const uint8_t * qs = x[i].qs;
9907
+ const uint8_t * qh = x[i].qh;
9908
+ const uint8_t * signs = qs + QK_K/8;
9909
+
9910
+ int bsum = 0;
9911
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9912
+ int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
9913
+ int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
9914
+ int sumi1 = 0, sumi2 = 0;
9915
+ for (int l = 0; l < 2; ++l) {
9916
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
9917
+ for (int j = 0; j < 8; ++j) {
9918
+ sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
9919
+ }
9920
+ q8 += 8;
9921
+ }
9922
+ for (int l = 2; l < 4; ++l) {
9923
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
9924
+ for (int j = 0; j < 8; ++j) {
9925
+ sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
9926
+ }
9927
+ q8 += 8;
9928
+ }
9929
+ bsum += ls1 * sumi1 + ls2 * sumi2;
9930
+ qs += 4;
9931
+ signs += 4;
9932
+ }
9933
+
9934
+ sumf += d * bsum;
9935
+ }
9936
+
9937
+ *s = 0.125f * sumf;
9938
+
9939
+ #endif
9940
+
9941
+ }
9942
+
9943
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9944
+ assert(n % QK_K == 0);
9945
+ assert(nrc == 1);
9946
+ UNUSED(nrc);
9947
+ UNUSED(bx);
9948
+ UNUSED(by);
9949
+ UNUSED(bs);
9950
+
9951
+ const block_iq3_xxs * restrict x = vx;
9952
+ const block_q8_K * restrict y = vy;
9953
+
9954
+ const int nb = n / QK_K;
9955
+
9956
+ #if defined(__ARM_NEON)
9957
+
9958
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9959
+
9960
+ uint32_t aux32[2];
9961
+
9962
+ ggml_int8x16x4_t q3s;
9963
+ ggml_int8x16x4_t q8b;
9964
+
9965
+ float sumf = 0;
9966
+ for (int i = 0; i < nb; ++i) {
9967
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9968
+ const uint8_t * restrict q3 = x[i].qs;
9969
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
9970
+ const int8_t * restrict q8 = y[i].qs;
9971
+ float sumf1 = 0, sumf2 = 0;
9972
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9973
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9974
+ memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
9975
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
9976
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
9977
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
9978
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
9979
+ q3 += 16;
9980
+ q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
9981
+ q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
9982
+ q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
9983
+ q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
9984
+ q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
9985
+ q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
9986
+ q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
9987
+ q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
9988
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
9989
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
9990
+ sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
9991
+ sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
9992
+ }
9993
+ sumf += d*(sumf1 + sumf2);
9994
+ }
9995
+ *s = 0.5f * sumf;
9996
+
9997
+ #elif defined(__AVX2__)
9998
+
9999
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10000
+
10001
+ uint32_t aux32[2];
10002
+
10003
+ __m256 accumf = _mm256_setzero_ps();
10004
+ for (int i = 0; i < nb; ++i) {
10005
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10006
+ const uint8_t * restrict q3 = x[i].qs;
10007
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
10008
+ const int8_t * restrict q8 = y[i].qs;
10009
+ __m256i sumi1 = _mm256_setzero_si256();
10010
+ __m256i sumi2 = _mm256_setzero_si256();
10011
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10012
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10013
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10014
+ const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
10015
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
10016
+ q3 += 8;
10017
+ const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
10018
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
10019
+ q3 += 8;
9053
10020
  memcpy(aux32, gas, 8); gas += 8;
9054
10021
  const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
9055
10022
  signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
@@ -9107,137 +10074,1449 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9107
10074
  #endif
9108
10075
  }
9109
10076
 
9110
- // ================================ IQ2 quantization =============================================
10077
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10078
+ assert(n % QK_K == 0);
10079
+ assert(nrc == 1);
10080
+ UNUSED(nrc);
10081
+ UNUSED(bx);
10082
+ UNUSED(by);
10083
+ UNUSED(bs);
9111
10084
 
9112
- typedef struct {
9113
- uint64_t * grid;
9114
- int * map;
9115
- uint16_t * neighbours;
9116
- } iq2_entry_t;
10085
+ const block_iq3_s * restrict x = vx;
10086
+ const block_q8_K * restrict y = vy;
9117
10087
 
9118
- static iq2_entry_t iq2_data[2] = {
9119
- {NULL, NULL, NULL},
9120
- {NULL, NULL, NULL},
9121
- };
10088
+ const int nb = n / QK_K;
9122
10089
 
9123
- static inline int iq2_data_index(int grid_size) {
9124
- GGML_ASSERT(grid_size == 256 || grid_size == 512);
9125
- return grid_size == 256 ? 0 : 1;
9126
- }
10090
+ #if defined(__ARM_NEON)
9127
10091
 
9128
- static int iq2_compare_func(const void * left, const void * right) {
9129
- const int * l = (const int *)left;
9130
- const int * r = (const int *)right;
9131
- return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
9132
- }
10092
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10093
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10094
+ };
9133
10095
 
9134
- void iq2xs_init_impl(int grid_size) {
9135
- const int gindex = iq2_data_index(grid_size);
9136
- if (iq2_data[gindex].grid) {
9137
- return;
9138
- }
9139
- static const uint16_t kgrid_256[256] = {
9140
- 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
9141
- 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
9142
- 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
9143
- 1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
9144
- 2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
9145
- 4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
9146
- 5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
9147
- 8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
9148
- 10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
9149
- 16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
9150
- 17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
9151
- 20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
9152
- 22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
9153
- 25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
9154
- 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
9155
- 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
9156
- };
9157
- static const uint16_t kgrid_512[512] = {
9158
- 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
9159
- 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
9160
- 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
9161
- 352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
9162
- 640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
9163
- 1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
9164
- 1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
9165
- 2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
9166
- 2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
9167
- 4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
9168
- 4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
9169
- 5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
9170
- 5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
9171
- 8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
9172
- 8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
9173
- 10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
9174
- 16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
9175
- 16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
9176
- 16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
9177
- 17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
9178
- 18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
9179
- 20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
9180
- 21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
9181
- 22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
9182
- 24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
9183
- 32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
9184
- 33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
9185
- 33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
9186
- 35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
9187
- 37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
9188
- 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
9189
- 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
9190
- };
9191
- const int kmap_size = 43692;
9192
- const int nwant = 2;
9193
- const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
9194
- uint64_t * kgrid_q2xs;
9195
- int * kmap_q2xs;
9196
- uint16_t * kneighbors_q2xs;
10096
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
9197
10097
 
9198
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
9199
- uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
9200
- for (int k = 0; k < grid_size; ++k) {
9201
- int8_t * pos = (int8_t *)(the_grid + k);
9202
- for (int i = 0; i < 8; ++i) {
9203
- int l = (kgrid[k] >> 2*i) & 0x3;
9204
- pos[i] = 2*l + 1;
10098
+ const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
10099
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
10100
+
10101
+ uint8x16x2_t vs;
10102
+ ggml_int8x16x4_t q3s;
10103
+ ggml_int8x16x4_t q8b;
10104
+
10105
+ float sumf = 0;
10106
+ for (int i = 0; i < nb; ++i) {
10107
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10108
+ const uint8_t * restrict qs = x[i].qs;
10109
+ const uint8_t * restrict qh = x[i].qh;
10110
+ const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10111
+ const int8_t * restrict q8 = y[i].qs;
10112
+ int sumi1 = 0, sumi2 = 0;
10113
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10114
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
10115
+ const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
10116
+ iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
10117
+ const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
10118
+ iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
10119
+ const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
10120
+ iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
10121
+ const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
10122
+ iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
10123
+ qs += 16;
10124
+
10125
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
10126
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10127
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10128
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
10129
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
10130
+
10131
+ q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0]));
10132
+ q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
10133
+
10134
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
10135
+ vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10136
+ vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10137
+ vs.val[0] = vceqq_u8(vs.val[0], mask2);
10138
+ vs.val[1] = vceqq_u8(vs.val[1], mask2);
10139
+
10140
+ signs += 4;
10141
+
10142
+ q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0]));
10143
+ q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1]));
10144
+
10145
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
10146
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
10147
+ sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
10148
+ sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
9205
10149
  }
10150
+ sumf += d*(sumi1 + sumi2);
9206
10151
  }
9207
- kgrid_q2xs = the_grid;
9208
- iq2_data[gindex].grid = the_grid;
9209
- kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
9210
- iq2_data[gindex].map = kmap_q2xs;
9211
- for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
9212
- uint64_t aux64;
9213
- uint8_t * aux8 = (uint8_t *)&aux64;
10152
+ *s = 0.25f * sumf;
10153
+
10154
+ #elif defined(__AVX2__)
10155
+
10156
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10157
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10158
+ };
10159
+
10160
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
10161
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
10162
+ };
10163
+
10164
+ const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
10165
+ const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
10166
+
10167
+ __m256 accumf = _mm256_setzero_ps();
10168
+ for (int i = 0; i < nb; ++i) {
10169
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10170
+ const uint8_t * restrict qs = x[i].qs;
10171
+ const uint8_t * restrict qh = x[i].qh;
10172
+ const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10173
+ const int8_t * restrict q8 = y[i].qs;
10174
+ __m256i sumi1 = _mm256_setzero_si256();
10175
+ __m256i sumi2 = _mm256_setzero_si256();
10176
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10177
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10178
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10179
+ const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)],
10180
+ iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)],
10181
+ iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)],
10182
+ iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)],
10183
+ iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)],
10184
+ iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)],
10185
+ iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)],
10186
+ iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
10187
+ qs += 8;
10188
+ const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)],
10189
+ iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)],
10190
+ iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)],
10191
+ iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)],
10192
+ iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)],
10193
+ iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)],
10194
+ iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)],
10195
+ iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]);
10196
+ qs += 8;
10197
+
10198
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
10199
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
10200
+ const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
10201
+ const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
10202
+
10203
+ aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
10204
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
10205
+ const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
10206
+ const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
10207
+
10208
+ signs += 4;
10209
+
10210
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
10211
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
10212
+ const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
10213
+ const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
10214
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
10215
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
10216
+ sumi1 = _mm256_add_epi32(sumi1, p1);
10217
+ sumi2 = _mm256_add_epi32(sumi2, p2);
10218
+ }
10219
+
10220
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
10221
+
10222
+ }
10223
+
10224
+ *s = 0.25f * hsum_float_8(accumf);
10225
+
10226
+ #else
10227
+
10228
+ float sumf = 0.f;
10229
+ for (int i = 0; i < nb; ++i) {
10230
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10231
+ const uint8_t * restrict qs = x[i].qs;
10232
+ const uint8_t * restrict qh = x[i].qh;
10233
+ const uint8_t * restrict signs = x[i].signs;
10234
+ const int8_t * restrict q8 = y[i].qs;
10235
+ int32_t bsum = 0;
10236
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10237
+ const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
10238
+ const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
10239
+ int32_t sumi = 0;
10240
+ for (int l = 0; l < 4; ++l) {
10241
+ const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
10242
+ const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
10243
+ for (int j = 0; j < 4; ++j) {
10244
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10245
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
10246
+ }
10247
+ q8 += 8;
10248
+ }
10249
+ qs += 8;
10250
+ signs += 4;
10251
+ bsum += sumi * ls1;
10252
+ sumi = 0;
10253
+ for (int l = 0; l < 4; ++l) {
10254
+ const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
10255
+ const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
10256
+ for (int j = 0; j < 4; ++j) {
10257
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10258
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
10259
+ }
10260
+ q8 += 8;
10261
+ }
10262
+ qs += 8;
10263
+ signs += 4;
10264
+ bsum += sumi * ls2;
10265
+ }
10266
+ sumf += d * bsum;
10267
+ }
10268
+ *s = 0.25f * sumf;
10269
+ #endif
10270
+ }
10271
+
10272
+
10273
#ifdef __AVX2__
// Multiplies the signed 8-bit lanes of x and y and adds adjacent products into
// 16-bit lanes. _mm256_maddubs_epi16 needs an unsigned first operand, so x is
// replaced by |x| and its sign is transferred onto y beforehand.
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
    const __m256i magnitude_x = _mm256_sign_epi8(x, x); // |x|
    const __m256i y_with_sign = _mm256_sign_epi8(y, x); // y negated where x < 0, zeroed where x == 0
    return _mm256_maddubs_epi16(magnitude_x, y_with_sign);
}
#endif
10280
+
10281
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10282
+ assert(n % QK_K == 0);
10283
+ assert(nrc == 1);
10284
+ UNUSED(nrc);
10285
+ UNUSED(bx);
10286
+ UNUSED(by);
10287
+ UNUSED(bs);
10288
+
10289
+ const block_iq1_s * restrict x = vx;
10290
+ const block_q8_K * restrict y = vy;
10291
+
10292
+ const int nb = n / QK_K;
10293
+
10294
+ // TODO: implement for QK_K = 64
10295
+ #if defined __ARM_NEON && QK_K == 256
10296
+
10297
+ const uint8x16_t m8 = vdupq_n_u8(0x08);
10298
+ const uint8x16_t m7 = vdupq_n_u8(0x07);
10299
+ const uint8x16_t m1 = vdupq_n_u8(0x01);
10300
+ const int32x4_t vzero = vdupq_n_s32(0);
10301
+
10302
+ uint16_t gindex[8];
10303
+ uint16x8x2_t vindex;
10304
+ int8x16x4_t q1b;
10305
+ ggml_int8x16x4_t q8b;
10306
+ uint16x8x4_t scales;
10307
+ int32x4x2_t sumi;
10308
+ int32x4x2_t dotq;
10309
+
10310
+ float sumf = 0;
10311
+ for (int i = 0; i < nb; ++i) {
10312
+
10313
+ const int8_t * q8 = y[i].qs;
10314
+ const uint8_t * qs = x[i].qs;
10315
+ const uint8_t * sc = x[i].scales;
10316
+
10317
+ sumi.val[0] = sumi.val[1] = vzero;
10318
+
10319
+ for (int i128 = 0; i128 < QK_K/128; ++i128) {
10320
+ const uint8x16_t ql = vld1q_u8(qs); qs += 16;
10321
+ const uint8x8_t tm1 = vld1_u8 (sc); sc += 8;
10322
+ const uint8x8_t tm2 = vshr_n_u8(tm1, 4);
10323
+ const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
10324
+ const uint8x16_t hbit = vandq_u8(qh, m8);
10325
+ vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
10326
+ vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
10327
+ const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
10328
+ scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
10329
+ scales.val[1] = vmovl_u8(vget_high_u8 (scales8));
10330
+
10331
+ for (int l = 0; l < 2; ++l) {
10332
+ vst1q_u16(gindex+0, vindex.val[l]);
10333
+ q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1])));
10334
+ q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3])));
10335
+ q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5])));
10336
+ q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7])));
10337
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
10338
+
10339
+ dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1]));
10340
+ dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3]));
10341
+
10342
+ sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
10343
+ sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
10344
+ }
10345
+ }
10346
+
10347
+ sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1]));
10348
+ }
10349
+
10350
+ *s = sumf;
10351
+
10352
+ // TODO: implement for QK_K = 64
10353
+ #elif defined __AVX2__ && QK_K == 256
10354
+
10355
+ const __m128i m8 = _mm_set1_epi8(0x08);
10356
+ const __m128i m7 = _mm_set1_epi8(0x07);
10357
+ const __m128i m1 = _mm_set1_epi8(0x01);
10358
+ const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
10359
+ const __m128i shuffle_s[4] = {
10360
+ _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
10361
+ _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
10362
+ _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
10363
+ _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
10364
+ };
10365
+
10366
+ uint64_t aux64;
10367
+
10368
+ typedef union m256i_uint16 {
10369
+ __m256i reg;
10370
+ uint16_t s[16];
10371
+ } m256i_uint16_t;
10372
+
10373
+ m256i_uint16_t v_gindex;
10374
+
10375
+ __m256 accum = _mm256_setzero_ps();
10376
+ for (int i = 0; i < nb; ++i) {
10377
+
10378
+ const int8_t * q8 = y[i].qs;
10379
+ const uint8_t * qs = x[i].qs;
10380
+ const uint8_t * sc = x[i].scales;
10381
+
10382
+ __m256i sumi = _mm256_setzero_si256();
10383
+ for (int i128 = 0; i128 < QK_K/128; ++i128) {
10384
+ const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10385
+ memcpy(&aux64, sc, 8); sc += 8;
10386
+ const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
10387
+ const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
10388
+ v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
10389
+ const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
10390
+
10391
+ for (int i32 = 0; i32 < 4; ++i32) {
10392
+ const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10393
+ const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
10394
+ iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
10395
+ const __m256i dot = mul_add_epi8(q1b, q8b);
10396
+ const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
10397
+ const __m256i p = _mm256_madd_epi16(s16, dot);
10398
+ sumi = _mm256_add_epi32(sumi, p);
10399
+ }
10400
+
10401
+ }
10402
+
10403
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum);
10404
+
10405
+ }
10406
+
10407
+ *s = hsum_float_8(accum);
10408
+
10409
+ #else
10410
+
10411
+ int db[4];
10412
+ uint16_t idx[4];
10413
+
10414
+ float sumf = 0;
10415
+ for (int i = 0; i < nb; ++i) {
10416
+
10417
+ const int8_t * q8 = y[i].qs;
10418
+ const uint8_t * qs = x[i].qs;
10419
+ const uint8_t * sc = x[i].scales;
10420
+
10421
+ int sumi = 0;
10422
+ for (int i32 = 0; i32 < QK_K/32; ++i32) {
10423
+ idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
10424
+ idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
10425
+ idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
10426
+ idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
10427
+ db[0] = (2*(sc[0] & 7) + 1);
10428
+ db[1] = (2*((sc[0] >> 4) & 7) + 1);
10429
+ db[2] = (2*(sc[1] & 7) + 1);
10430
+ db[3] = (2*((sc[1] >> 4) & 7) + 1);
10431
+ for (int l = 0; l < 4; ++l) {
10432
+ const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
10433
+ int suml = 0;
10434
+ for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j];
10435
+ sumi += db[l] * suml;
10436
+ q8 += 8;
10437
+ }
10438
+ qs += 4;
10439
+ sc += 2;
10440
+ }
10441
+
10442
+ sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
10443
+ }
10444
+
10445
+ *s = sumf;
10446
+
10447
+ #endif
10448
+ }
10449
+
10450
+ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10451
+ assert(nrc == 1);
10452
+ UNUSED(nrc);
10453
+ UNUSED(bx);
10454
+ UNUSED(by);
10455
+ UNUSED(bs);
10456
+ assert(n % QK4_NL == 0);
10457
+ static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
10458
+
10459
+ const block_iq4_nl * restrict x = vx;
10460
+ const block_q8_0 * restrict y = vy;
10461
+
10462
+ const int nb = n / QK4_NL;
10463
+
10464
+ #if defined __ARM_NEON
10465
+ const int8x16_t values = vld1q_s8(kvalues_iq4nl);
10466
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
10467
+ uint8x16x2_t q4bits;
10468
+ int8x16x4_t q4b;
10469
+ int8x16x4_t q8b;
10470
+ int32x4_t prod_1, prod_2;
10471
+
10472
+ float sumf = 0;
10473
+
10474
+ for (int ib = 0; ib < nb; ib += 2) {
10475
+
10476
+ q4bits.val[0] = vld1q_u8(x[ib+0].qs);
10477
+ q4bits.val[1] = vld1q_u8(x[ib+1].qs);
10478
+ q8b.val[0] = vld1q_s8(y[ib+0].qs);
10479
+ q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
10480
+ q8b.val[2] = vld1q_s8(y[ib+1].qs);
10481
+ q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);
10482
+
10483
+ q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
10484
+ q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
10485
+ q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
10486
+ q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
10487
+
10488
+ prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
10489
+ prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
10490
+
10491
+ sumf +=
10492
+ GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
10493
+ GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
10494
+ }
10495
+
10496
+ *s = sumf;
10497
+
10498
+ #elif defined __AVX2__
10499
+
10500
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
10501
+ const __m128i m4b = _mm_set1_epi8(0x0f);
10502
+ const __m256i mone = _mm256_set1_epi16(1);
10503
+
10504
+ __m256 accum1 = _mm256_setzero_ps();
10505
+ __m256 accum2 = _mm256_setzero_ps();
10506
+ for (int ib = 0; ib < nb; ib += 2) {
10507
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
10508
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
10509
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
10510
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
10511
+ const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10512
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10513
+ const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10514
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10515
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10516
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10517
+ const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
10518
+ const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
10519
+ accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
10520
+ _mm256_cvtepi32_ps(p_1), accum1);
10521
+ accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
10522
+ _mm256_cvtepi32_ps(p_2), accum2);
10523
+
10524
+ y += 2;
10525
+ x += 2;
10526
+ }
10527
+
10528
+ *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
10529
+
10530
+ #else
10531
+ float sumf = 0;
10532
+ for (int ib = 0; ib < nb; ++ib) {
10533
+ const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
10534
+ int sumi1 = 0, sumi2 = 0;
10535
+ for (int j = 0; j < QK4_NL/2; ++j) {
10536
+ sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
10537
+ sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
10538
+ }
10539
+ sumf += d * (sumi1 + sumi2);
10540
+ }
10541
+ *s = sumf;
10542
+ #endif
10543
+ }
10544
+
10545
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10546
+ assert(nrc == 1);
10547
+ UNUSED(nrc);
10548
+ UNUSED(bx);
10549
+ UNUSED(by);
10550
+ UNUSED(bs);
10551
+ assert(n % QK_K == 0);
10552
+ #if QK_K == 64
10553
+ ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
10554
+ #else
10555
+
10556
+ const block_iq4_xs * restrict x = vx;
10557
+ const block_q8_K * restrict y = vy;
10558
+
10559
+ const int nb = n / QK_K;
10560
+
10561
+ #if defined __ARM_NEON
10562
+ const int8x16_t values = vld1q_s8(kvalues_iq4nl);
10563
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
10564
+ ggml_uint8x16x2_t q4bits;
10565
+ ggml_int8x16x4_t q4b;
10566
+ ggml_int8x16x4_t q8b;
10567
+ int32x4_t prod_1, prod_2;
10568
+
10569
+ float sumf = 0;
10570
+
10571
+ for (int ibl = 0; ibl < nb; ++ibl) {
10572
+
10573
+ const int8_t * q8 = y[ibl].qs;
10574
+ const uint8_t * q4 = x[ibl].qs;
10575
+ uint16_t h = x[ibl].scales_h;
10576
+
10577
+ int sumi1 = 0, sumi2 = 0;
10578
+ for (int ib = 0; ib < QK_K/64; ++ib) {
10579
+
10580
+ q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
10581
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
10582
+
10583
+ q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
10584
+ q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
10585
+ q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
10586
+ q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
10587
+
10588
+ prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
10589
+ prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
10590
+
10591
+ int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
10592
+ int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
10593
+ h >>= 4;
10594
+ sumi1 += vaddvq_s32(prod_1) * ls1;
10595
+ sumi2 += vaddvq_s32(prod_2) * ls2;
10596
+
10597
+ }
10598
+
10599
+ sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
10600
+ }
10601
+
10602
+ *s = sumf;
10603
+
10604
+ #elif defined __AVX2__
10605
+
10606
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
10607
+ const __m128i m4b = _mm_set1_epi8(0x0f);
10608
+
10609
+ __m256 accum = _mm256_setzero_ps();
10610
+ for (int ibl = 0; ibl < nb; ++ibl) {
10611
+ const uint8_t * qs = x[ibl].qs;
10612
+ const int8_t * q8 = y[ibl].qs;
10613
+ uint16_t sh = x[ibl].scales_h;
10614
+ __m256i sumi1 = _mm256_setzero_si256();
10615
+ __m256i sumi2 = _mm256_setzero_si256();
10616
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
10617
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10618
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10619
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10620
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10621
+ const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10622
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10623
+ const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10624
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10625
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10626
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10627
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
10628
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
10629
+ sh >>= 4;
10630
+ const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
10631
+ const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
10632
+ sumi1 = _mm256_add_epi32(p_1, sumi1);
10633
+ sumi2 = _mm256_add_epi32(p_2, sumi2);
10634
+ }
10635
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
10636
+ _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
10637
+ }
10638
+
10639
+ *s = hsum_float_8(accum);
10640
+
10641
+ #else
10642
+ float sumf = 0;
10643
+ for (int ibl = 0; ibl < nb; ++ibl) {
10644
+ const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
10645
+ uint16_t h = x[ibl].scales_h;
10646
+ const uint8_t * qs = x[ibl].qs;
10647
+ const int8_t * q8 = y[ibl].qs;
10648
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
10649
+ const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
10650
+ const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
10651
+ h >>= 4;
10652
+ const float d1 = d4d8*(ls1 - 32);
10653
+ const float d2 = d4d8*(ls2 - 32);
10654
+ int sumi1 = 0, sumi2 = 0;
10655
+ for (int j = 0; j < 16; ++j) {
10656
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
10657
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
10658
+ }
10659
+ sumf += d1 * (sumi1 + sumi2);
10660
+ qs += 16;
10661
+ q8 += 32;
10662
+ sumi1 = sumi2 = 0;
10663
+ for (int j = 0; j < 16; ++j) {
10664
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
10665
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
10666
+ }
10667
+ sumf += d2 * (sumi1 + sumi2);
10668
+ qs += 16;
10669
+ q8 += 32;
10670
+ }
10671
+ }
10672
+ *s = sumf;
10673
+ #endif
10674
+ #endif
10675
+ }
10676
+
10677
+ // ================================ IQ2 quantization =============================================
10678
+
10679
// Lookup tables for one grid-based quantization type. All three pointers are
// allocated by iq2xs_init_impl() and released by iq2xs_free_impl().
typedef struct {
    uint64_t * grid;        // grid points, 8 one-byte coordinates packed into each uint64_t
    int      * map;         // packed 16-bit code -> grid index (>= 0) or negative neighbour-list reference
    uint16_t * neighbours;  // concatenated neighbour lists, each prefixed by its length
} iq2_entry_t;

// One slot per supported type, selected with iq2_data_index().
// A NULL grid pointer means "not initialised yet".
static iq2_entry_t iq2_data[4] = {
    { .grid = NULL, .map = NULL, .neighbours = NULL },
    { .grid = NULL, .map = NULL, .neighbours = NULL },
    { .grid = NULL, .map = NULL, .neighbours = NULL },
    { .grid = NULL, .map = NULL, .neighbours = NULL },
};
10691
+
10692
+ static inline int iq2_data_index(enum ggml_type type) {
10693
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10694
+ return type == GGML_TYPE_IQ2_XXS ? 0 :
10695
+ type == GGML_TYPE_IQ2_XS ? 1 :
10696
+ type == GGML_TYPE_IQ1_S ? 2 : 3;
10697
+ }
10698
+
10699
+ static inline int iq2_grid_size(enum ggml_type type) {
10700
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10701
+ return type == GGML_TYPE_IQ2_XXS ? 256 :
10702
+ type == GGML_TYPE_IQ2_XS ? 512 :
10703
+ type == GGML_TYPE_IQ1_S ? 512 : 1024;
10704
+ }
10705
+
10706
// qsort() comparator for records made of two ints: orders by the first int,
// and breaks ties with the second. Returns -1, 0 or 1.
static int iq2_compare_func(const void * left, const void * right) {
    const int * lhs = (const int *)left;
    const int * rhs = (const int *)right;
    if (lhs[0] != rhs[0]) {
        return lhs[0] < rhs[0] ? -1 : 1;
    }
    if (lhs[1] != rhs[1]) {
        return lhs[1] < rhs[1] ? -1 : 1;
    }
    return 0;
}
10711
+
10712
+ void iq2xs_init_impl(enum ggml_type type) { // Builds the grid, code->grid map and neighbour lists for `type` and stores them in iq2_data[]; does nothing if the grid already exists.
10713
+ const int gindex = iq2_data_index(type);
10714
+ const int grid_size = iq2_grid_size(type);
10715
+ if (iq2_data[gindex].grid) { // tables for this type were already built
10716
+ return;
10717
+ }
10718
+ static const uint16_t kgrid_2bit_256[256] = { // each entry packs eight 2-bit codes (unpacked further below); selected for GGML_TYPE_IQ2_XXS
10719
+ 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
10720
+ 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
10721
+ 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
10722
+ 1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
10723
+ 2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
10724
+ 4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
10725
+ 5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
10726
+ 8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
10727
+ 10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
10728
+ 16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
10729
+ 17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
10730
+ 20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
10731
+ 22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
10732
+ 25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
10733
+ 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
10734
+ 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
10735
+ };
10736
+ static const uint16_t kgrid_2bit_512[512] = { // same packing; selected for GGML_TYPE_IQ2_XS
10737
+ 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
10738
+ 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
10739
+ 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
10740
+ 352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
10741
+ 640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
10742
+ 1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
10743
+ 1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
10744
+ 2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
10745
+ 2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
10746
+ 4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
10747
+ 4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
10748
+ 5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
10749
+ 5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
10750
+ 8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
10751
+ 8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
10752
+ 10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
10753
+ 16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
10754
+ 16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
10755
+ 16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
10756
+ 17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
10757
+ 18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
10758
+ 20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
10759
+ 21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
10760
+ 22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
10761
+ 24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
10762
+ 32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
10763
+ 33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
10764
+ 33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
10765
+ 35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
10766
+ 37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
10767
+ 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
10768
+ 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
10769
+ };
10770
+ static const uint16_t kgrid_1bit_512[512] = { // same packing; selected for GGML_TYPE_IQ1_S
10771
+ 10, 33, 41, 85, 132, 134, 160, 162, 277, 337, 340, 345, 357, 405, 516, 545,
10772
+ 553, 598, 641, 650, 681, 1042, 1044, 1097, 1169, 1176, 1320, 1345, 1365, 1378, 1434, 1444,
10773
+ 1545, 1617, 1642, 1685, 2053, 2080, 2089, 2133, 2176, 2182, 2208, 2214, 2306, 2384, 2393, 2440,
10774
+ 2453, 2581, 2664, 2690, 2721, 4117, 4161, 4182, 4184, 4261, 4357, 4369, 4372, 4377, 4390, 4422,
10775
+ 4432, 4437, 4449, 4457, 4485, 4497, 4505, 4629, 4677, 4696, 4774, 5205, 5217, 5225, 5386, 5397,
10776
+ 5409, 5445, 5457, 5460, 5461, 5462, 5465, 5472, 5477, 5525, 5545, 5650, 5668, 5717, 5729, 5769,
10777
+ 5777, 6212, 6234, 6244, 6293, 6424, 6482, 6485, 6502, 6505, 6529, 6538, 6565, 6656, 6682, 6788,
10778
+ 6806, 6820, 8218, 8224, 8226, 8232, 8277, 8326, 8354, 8469, 8521, 8530, 8549, 8596, 8737, 8794,
10779
+ 9221, 9253, 9348, 9369, 9380, 9474, 9557, 9633, 9732, 9753, 9793, 9830, 9862, 9880, 10240, 10272,
10780
+ 10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665,
10781
+ 16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685,
10782
+ 17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529,
10783
+ 18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517,
10784
+ 20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872,
10785
+ 20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653,
10786
+ 21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842,
10787
+ 21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913,
10788
+ 21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608,
10789
+ 22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072,
10790
+ 23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110,
10791
+ 25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937,
10792
+ 25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885,
10793
+ 26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808,
10794
+ 32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320,
10795
+ 33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918,
10796
+ 34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125,
10797
+ 37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973,
10798
+ 38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485,
10799
+ 38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497,
10800
+ 39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514,
10801
+ 41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
10802
+ 42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
10803
+ };
10804
+ static const uint16_t kgrid_2bit_1024[1024] = { // same packing; selected for the remaining type (GGML_TYPE_IQ2_S)
10805
+ 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
10806
+ 73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
10807
+ 165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
10808
+ 337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
10809
+ 517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
10810
+ 674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
10811
+ 1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
10812
+ 1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
10813
+ 1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
10814
+ 1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
10815
+ 2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
10816
+ 2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
10817
+ 2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
10818
+ 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
10819
+ 4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
10820
+ 4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
10821
+ 4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
10822
+ 4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
10823
+ 5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
10824
+ 5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
10825
+ 5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
10826
+ 5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
10827
+ 6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
10828
+ 6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
10829
+ 8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
10830
+ 8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
10831
+ 8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
10832
+ 9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
10833
+ 9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
10834
+ 10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
10835
+ 16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
10836
+ 16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
10837
+ 16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
10838
+ 16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
10839
+ 17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
10840
+ 17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
10841
+ 17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
10842
+ 17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
10843
+ 18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
10844
+ 18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
10845
+ 18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
10846
+ 20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
10847
+ 20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
10848
+ 20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
10849
+ 21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
10850
+ 21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
10851
+ 22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
10852
+ 22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
10853
+ 24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
10854
+ 24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
10855
+ 25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
10856
+ 26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
10857
+ 32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
10858
+ 33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
10859
+ 33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
10860
+ 33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
10861
+ 34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
10862
+ 35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
10863
+ 36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
10864
+ 37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
10865
+ 38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
10866
+ 39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
10867
+ 41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
10868
+ 42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
10869
+ };
10870
+
10871
+ const int kmap_size = 43692; // length of the code->grid map; the largest packed code in the tables above is 43690
10872
+ //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
10873
+ const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2; // number of distinct distance levels of neighbours kept per off-grid code
10874
+ const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
10875
+ type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
10876
+ type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
10877
+ uint64_t * kgrid_q2xs;
10878
+ int * kmap_q2xs;
10879
+ uint16_t * kneighbors_q2xs;
10880
+
10881
+ printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); // progress banner, written to stdout
10882
+ uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t)); // NOTE(review): allocation result is not checked for NULL
10883
+ for (int k = 0; k < grid_size; ++k) { // unpack every table entry into 8 one-byte coordinates
10884
+ int8_t * pos = (int8_t *)(the_grid + k);
10885
+ for (int i = 0; i < 8; ++i) {
10886
+ int l = (kgrid[k] >> 2*i) & 0x3; // i-th 2-bit code of the entry
10887
+ pos[i] = 2*l + 1; // code 0..3 becomes the odd value 1, 3, 5 or 7
10888
+ }
10889
+ }
10890
+ kgrid_q2xs = the_grid;
10891
+ iq2_data[gindex].grid = the_grid;
10892
+ kmap_q2xs = (int *)malloc(kmap_size*sizeof(int)); // NOTE(review): allocation result is not checked for NULL
10893
+ iq2_data[gindex].map = kmap_q2xs;
10894
+ for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1; // -1 marks a code that is not (yet known to be) on the grid
10895
+ uint64_t aux64;
10896
+ uint8_t * aux8 = (uint8_t *)&aux64; // byte-wise view of aux64
10897
+ for (int i = 0; i < grid_size; ++i) { // re-pack each grid point into its 16-bit code and record its grid position
10898
+ aux64 = kgrid_q2xs[i];
10899
+ uint16_t index = 0;
10900
+ for (int k=0; k<8; ++k) {
10901
+ uint16_t q = (aux8[k] - 1)/2; // inverse of the 2*l + 1 mapping used above
10902
+ index |= (q << 2*k);
10903
+ }
10904
+ kmap_q2xs[index] = i;
10905
+ }
10906
+ int8_t pos[8];
10907
+ int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); // scratch array of (squared distance, grid index) pairs; NOTE(review): not checked for NULL
10908
+ int num_neighbors = 0, num_not_in_map = 0;
10909
+ for (int i = 0; i < kmap_size; ++i) { // pass 1: only count neighbours so the neighbour array can be sized
10910
+ if (kmap_q2xs[i] >= 0) continue; // code is on the grid, no neighbour list needed
10911
+ ++num_not_in_map;
10912
+ for (int k = 0; k < 8; ++k) {
10913
+ int l = (i >> 2*k) & 0x3;
10914
+ pos[k] = 2*l + 1;
10915
+ }
10916
+ for (int j = 0; j < grid_size; ++j) { // squared Euclidean distance from this code to every grid point
10917
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
10918
+ int d2 = 0;
10919
+ for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
10920
+ dist2[2*j+0] = d2;
10921
+ dist2[2*j+1] = j;
10922
+ }
10923
+ qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); // ascending by distance, then by grid index
10924
+ int n = 0; int d2 = dist2[0]; // d2 starts at the smallest distance
10925
+ int nhave = 1; // distance levels seen so far
10926
+ for (int j = 0; j < grid_size; ++j) {
10927
+ if (dist2[2*j] > d2) { // reached the next larger distance level
10928
+ if (nhave == nwant) break;
10929
+ d2 = dist2[2*j];
10930
+ ++nhave;
10931
+ }
10932
+ ++n;
10933
+ }
10934
+ num_neighbors += n;
10935
+ }
10936
+ printf("%s: %d neighbours in total\n", __func__, num_neighbors); // written to stdout
10937
+ kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); // one count slot per off-grid code plus all neighbour indices; NOTE(review): not checked for NULL
10938
+ iq2_data[gindex].neighbours = kneighbors_q2xs;
10939
+ int counter = 0; // next free slot in kneighbors_q2xs
10940
+ for (int i = 0; i < kmap_size; ++i) { // pass 2: same search as pass 1, this time storing the lists
10941
+ if (kmap_q2xs[i] >= 0) continue;
10942
+ for (int k = 0; k < 8; ++k) {
10943
+ int l = (i >> 2*k) & 0x3;
10944
+ pos[k] = 2*l + 1;
10945
+ }
10946
+ for (int j = 0; j < grid_size; ++j) {
10947
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
10948
+ int d2 = 0;
10949
+ for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
10950
+ dist2[2*j+0] = d2;
10951
+ dist2[2*j+1] = j;
10952
+ }
10953
+ qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
10954
+ kmap_q2xs[i] = -(counter + 1); // negative entry: the list for this code starts at kneighbors_q2xs[-entry - 1]
10955
+ int d2 = dist2[0];
10956
+ uint16_t * start = &kneighbors_q2xs[counter++]; // slot reserved for the list length, filled in below
10957
+ int n = 0, nhave = 1;
10958
+ for (int j = 0; j < grid_size; ++j) {
10959
+ if (dist2[2*j] > d2) {
10960
+ if (nhave == nwant) break;
10961
+ d2 = dist2[2*j];
10962
+ ++nhave;
10963
+ }
10964
+ kneighbors_q2xs[counter++] = dist2[2*j+1]; // store the grid index of this neighbour
10965
+ ++n;
10966
+ }
10967
+ *start = n;
10968
+ }
10969
+ free(dist2); // only the scratch array is released; grid, map and neighbours stay in iq2_data
10970
+ }
10971
+
10972
+ void iq2xs_free_impl(enum ggml_type type) {
10973
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10974
+ const int gindex = iq2_data_index(type);
10975
+ if (iq2_data[gindex].grid) {
10976
+ free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
10977
+ free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
10978
+ free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
10979
+ }
10980
+ }
10981
+
10982
// Pick, among a list of candidate grid points, the one with the smallest
// weighted squared error to a group of 8 values.
//
//   neighbours : neighbours[0] is the candidate count, followed by that many grid indices
//   grid       : grid points, 8 one-byte coordinates per uint64_t
//   xval       : the 8 values being approximated
//   weight     : per-value weights applied to the squared error
//   scale      : factor applied to the grid coordinates before comparing with xval
//   L          : output, receives (coordinate - 1)/2 for each of the 8 coordinates of the winner
//
// Returns the grid index of the winner. Aborts (GGML_ASSERT) if the list is empty
// or no candidate compares below FLT_MAX.
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);

    int   grid_index = -1;
    float lowest_err = FLT_MAX;
    int   n = 1;
    while (n <= num_neighbors) {
        const int candidate = neighbours[n];
        const int8_t * point = (const int8_t *)(grid + candidate);
        float err = 0;
        for (int c = 0; c < 8; ++c) {
            float q = point[c];
            float diff = scale*q - xval[c];
            err += weight[c]*diff*diff;
        }
        // Strict comparison: on a tie the earlier candidate wins.
        if (err < lowest_err) {
            lowest_err = err;
            grid_index = candidate;
        }
        ++n;
    }
    GGML_ASSERT(grid_index >= 0);

    const int8_t * winner = (const int8_t *)(grid + grid_index);
    for (int c = 0; c < 8; ++c) {
        L[c] = (winner[c] - 1)/2;
    }
    return grid_index;
}
11005
+
11006
+ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
11007
+
11008
+ const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
11009
+
11010
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
11011
+ const int * kmap_q2xs = iq2_data[gindex].map;
11012
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
11013
+
11014
+ GGML_ASSERT(quant_weights && "missing quantization weights");
11015
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
11016
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
11017
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
11018
+ GGML_ASSERT(n%QK_K == 0);
11019
+
11020
+ const int kMaxQ = 3;
11021
+
11022
+ const int nbl = n/QK_K;
11023
+
11024
+ block_iq2_xxs * y = vy;
11025
+
11026
+ float scales[QK_K/32];
11027
+ float weight[32];
11028
+ float xval[32];
11029
+ int8_t L[32];
11030
+ int8_t Laux[32];
11031
+ float waux[32];
11032
+ uint8_t block_signs[4];
11033
+ uint32_t q2[2*(QK_K/32)];
11034
+
11035
+ for (int ibl = 0; ibl < nbl; ++ibl) {
11036
+
11037
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
11038
+ memset(q2, 0, QK_K/4);
11039
+
11040
+ float max_scale = 0;
11041
+
11042
+ const float * xbl = x + QK_K*ibl;
11043
+ float sumx2 = 0;
11044
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
11045
+ float sigma2 = sumx2/QK_K;
11046
+
11047
+ for (int ib = 0; ib < QK_K/32; ++ib) {
11048
+ const float * xb = xbl + 32*ib;
11049
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
11050
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11051
+ for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
11052
+ for (int k = 0; k < 4; ++k) {
11053
+ int nflip = 0;
11054
+ uint8_t s = 0;
11055
+ for (int i = 0; i < 8; ++i) {
11056
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
11057
+ else {
11058
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
11059
+ }
11060
+ }
11061
+ if (nflip%2) {
11062
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
11063
+ for (int i = 1; i < 8; ++i) {
11064
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
11065
+ if (ax < min) {
11066
+ min = ax; imin = i;
11067
+ }
11068
+ }
11069
+ xval[8*k+imin] = -xval[8*k+imin];
11070
+ s ^= (1 << imin);
11071
+ }
11072
+ block_signs[k] = s & 127;
11073
+ }
11074
+ float max = xval[0];
11075
+ for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
11076
+ if (!max) {
11077
+ scales[ib] = 0;
11078
+ memset(L, 0, 32);
11079
+ continue;
11080
+ }
11081
+ float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
11082
+ float eff_max = scale*kMaxQ;
11083
+ float best = 0;
11084
+ for (int is = -6; is <= 6; ++is) {
11085
+ float id = (2*kMaxQ-1+is*0.1f)/eff_max;
11086
+ float this_scale = 1/id;
11087
+ for (int k = 0; k < 4; ++k) {
11088
+ for (int i = 0; i < 8; ++i) {
11089
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
11090
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
11091
+ }
11092
+ uint16_t u = 0;
11093
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
11094
+ int grid_index = kmap_q2xs[u];
11095
+ if (grid_index < 0) {
11096
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
11097
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
11098
+ }
11099
+ }
11100
+ float sumqx = 0, sumq2 = 0;
11101
+ for (int i = 0; i < 32; ++i) {
11102
+ float w = weight[i];
11103
+ float q = 2*Laux[i] + 1;
11104
+ sumqx += w*xval[i]*q;
11105
+ sumq2 += w*q*q;
11106
+ }
11107
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
11108
+ scale = sumqx/sumq2; best = scale*sumqx;
11109
+ memcpy(L, Laux, 32);
11110
+ }
11111
+ }
11112
+ if (scale > 0) {
11113
+ float id = 1/scale;
11114
+ for (int k = 0; k < 4; ++k) {
11115
+ uint16_t u = 0;
11116
+ for (int i = 0; i < 8; ++i) {
11117
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
11118
+ l = MAX(0, MIN(kMaxQ-1, l));
11119
+ u |= (l << 2*i);
11120
+ }
11121
+ int grid_index = kmap_q2xs[u];
11122
+ if (grid_index < 0) {
11123
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
11124
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
11125
+ }
11126
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
11127
+ for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
11128
+ }
11129
+ float sumqx = 0, sumq2 = 0;
11130
+ for (int i = 0; i < 32; ++i) {
11131
+ float w = weight[i];
11132
+ float q = 2*L[i] + 1;
11133
+ sumqx += w*xval[i]*q;
11134
+ sumq2 += w*q*q;
11135
+ }
11136
+ if (sumq2 > 0) scale = sumqx/sumq2;
11137
+ }
11138
+ if (scale < 0) {
11139
+ // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
11140
+ // and correspondingly flip quant signs.
11141
+ scale = -scale;
11142
+ for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
11143
+ }
11144
+ for (int k = 0; k < 4; ++k) {
11145
+ uint16_t u = 0;
11146
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
11147
+ int grid_index = kmap_q2xs[u];
11148
+ if (grid_index < 0) {
11149
+ printf("Oops: found point %u not on grid:", u);
11150
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
11151
+ printf("\n");
11152
+ GGML_ASSERT(false);
11153
+ }
11154
+ q2[2*ib+0] |= (grid_index << 8*k);
11155
+ q2[2*ib+1] |= (block_signs[k] << 7*k);
11156
+ }
11157
+ GGML_ASSERT(scale >= 0);
11158
+ scales[ib] = scale;
11159
+ max_scale = MAX(max_scale, scale);
11160
+ }
11161
+
11162
+ if (!max_scale) {
11163
+ memset(y[ibl].qs, 0, QK_K/4);
11164
+ continue;
11165
+ }
11166
+
11167
+ float d = max_scale/31;
11168
+ y[ibl].d = GGML_FP32_TO_FP16(d);
11169
+ float id = 1/d;
11170
+ for (int ib = 0; ib < QK_K/32; ++ib) {
11171
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
11172
+ l = MAX(0, MIN(15, l));
11173
+ q2[2*ib+1] |= ((uint32_t)l << 28);
11174
+ }
11175
+ memcpy(y[ibl].qs, q2, QK_K/4);
11176
+ }
11177
+ }
11178
+
11179
+ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
11180
+
11181
+ const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
11182
+
11183
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
11184
+ const int * kmap_q2xs = iq2_data[gindex].map;
11185
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
11186
+
11187
+ GGML_ASSERT(quant_weights && "missing quantization weights");
11188
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
11189
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
11190
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
11191
+ GGML_ASSERT(n%QK_K == 0);
11192
+
11193
+ const int kMaxQ = 3;
11194
+
11195
+ const int nbl = n/QK_K;
11196
+
11197
+ block_iq2_xs * y = vy;
11198
+
11199
+ float scales[QK_K/16];
11200
+ float weight[16];
11201
+ float xval[16];
11202
+ int8_t L[16];
11203
+ int8_t Laux[16];
11204
+ float waux[16];
11205
+ bool is_on_grid[2];
11206
+ bool is_on_grid_aux[2];
11207
+ uint8_t block_signs[2];
11208
+ uint16_t q2[2*(QK_K/16)];
11209
+
11210
+ for (int ibl = 0; ibl < nbl; ++ibl) {
11211
+
11212
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
11213
+ memset(q2, 0, QK_K/4);
11214
+ memset(y[ibl].scales, 0, QK_K/32);
11215
+
11216
+ float max_scale = 0;
11217
+
11218
+ const float * xbl = x + QK_K*ibl;
11219
+ float sumx2 = 0;
11220
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
11221
+ float sigma2 = sumx2/QK_K;
11222
+
11223
+ for (int ib = 0; ib < QK_K/16; ++ib) {
11224
+ const float * xb = xbl + 16*ib;
11225
+ const float * qw = quant_weights + QK_K*ibl + 16*ib;
11226
+ for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11227
+ for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
11228
+ for (int k = 0; k < 2; ++k) {
11229
+ int nflip = 0;
11230
+ uint8_t s = 0;
11231
+ for (int i = 0; i < 8; ++i) {
11232
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
11233
+ else {
11234
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
11235
+ }
11236
+ }
11237
+ if (nflip%2) {
11238
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
11239
+ for (int i = 1; i < 8; ++i) {
11240
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
11241
+ if (ax < min) {
11242
+ min = ax; imin = i;
11243
+ }
11244
+ }
11245
+ xval[8*k+imin] = -xval[8*k+imin];
11246
+ s ^= (1 << imin);
11247
+ }
11248
+ block_signs[k] = s & 127;
11249
+ }
11250
+ float max = xval[0];
11251
+ for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
11252
+ if (!max) {
11253
+ scales[ib] = 0;
11254
+ memset(L, 0, 16);
11255
+ continue;
11256
+ }
11257
+ float best = 0;
11258
+ float scale = max/(2*kMaxQ-1);
11259
+ is_on_grid[0] = is_on_grid[1] = true;
11260
+ for (int is = -9; is <= 9; ++is) {
11261
+ float id = (2*kMaxQ-1+is*0.1f)/max;
11262
+ float this_scale = 1/id;
11263
+ for (int k = 0; k < 2; ++k) {
11264
+ for (int i = 0; i < 8; ++i) {
11265
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
11266
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
11267
+ }
11268
+ uint16_t u = 0;
11269
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
11270
+ int grid_index = kmap_q2xs[u];
11271
+ is_on_grid_aux[k] = true;
11272
+ if (grid_index < 0) {
11273
+ is_on_grid_aux[k] = false;
11274
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
11275
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
11276
+ }
11277
+ }
11278
+ float sumqx = 0, sumq2 = 0;
11279
+ for (int i = 0; i < 16; ++i) {
11280
+ float w = weight[i];
11281
+ float q = 2*Laux[i] + 1;
11282
+ sumqx += w*xval[i]*q;
11283
+ sumq2 += w*q*q;
11284
+ }
11285
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
11286
+ scale = sumqx/sumq2; best = scale*sumqx;
11287
+ for (int i = 0; i < 16; ++i) L[i] = Laux[i];
11288
+ for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
11289
+ }
11290
+ }
11291
+ int n_not_ongrid = 0;
11292
+ for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
11293
+ if (n_not_ongrid > 0 && scale > 0) {
11294
+ float id = 1/scale;
11295
+ for (int k = 0; k < 2; ++k) {
11296
+ if (is_on_grid[k]) continue;
11297
+ uint16_t u = 0;
11298
+ for (int i = 0; i < 8; ++i) {
11299
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
11300
+ l = MAX(0, MIN(kMaxQ-1, l));
11301
+ u |= (l << 2*i);
11302
+ L[8*k + i] = l;
11303
+ }
11304
+ int grid_index = kmap_q2xs[u];
11305
+ if (grid_index < 0) {
11306
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
11307
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
11308
+ }
11309
+ }
11310
+ float sumqx = 0, sumq2 = 0;
11311
+ for (int i = 0; i < 16; ++i) {
11312
+ float w = weight[i];
11313
+ float q = 2*L[i] + 1;
11314
+ sumqx += w*xval[i]*q;
11315
+ sumq2 += w*q*q;
11316
+ }
11317
+ if (sumq2 > 0) scale = sumqx/sumq2;
11318
+ }
11319
+ if (scale < 0) {
11320
+ scale = -scale;
11321
+ for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
11322
+ }
11323
+ for (int k = 0; k < 2; ++k) {
11324
+ uint16_t u = 0;
11325
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
11326
+ int grid_index = kmap_q2xs[u];
11327
+ if (grid_index < 0) {
11328
+ printf("Oops: found point %u not on grid:", u);
11329
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
11330
+ printf("\n");
11331
+ GGML_ASSERT(false);
11332
+ }
11333
+ q2[2*ib+k] = grid_index | (block_signs[k] << 9);
11334
+ }
11335
+ GGML_ASSERT(scale >= 0);
11336
+ scales[ib] = scale;
11337
+ max_scale = MAX(max_scale, scale);
11338
+ }
11339
+
11340
+ if (!max_scale) {
11341
+ memset(y[ibl].qs, 0, QK_K/4);
11342
+ continue;
11343
+ }
11344
+
11345
+ float d = max_scale/31;
11346
+ y[ibl].d = GGML_FP32_TO_FP16(d);
11347
+ float id = 1/d;
11348
+ for (int ib = 0; ib < QK_K/16; ++ib) {
11349
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
11350
+ l = MAX(0, MIN(15, l));
11351
+ if (ib%2 == 0) y[ibl].scales[ib/2] = l;
11352
+ else y[ibl].scales[ib/2] |= (l << 4);
11353
+ }
11354
+ memcpy(y[ibl].qs, q2, QK_K/4);
11355
+
11356
+ }
11357
+ }
11358
+
11359
+ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11360
+ (void)hist;
11361
+ GGML_ASSERT(n_per_row%QK_K == 0);
11362
+ int nblock = n_per_row/QK_K;
11363
+ char * qrow = (char *)dst;
11364
+ for (int row = 0; row < nrow; ++row) {
11365
+ quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
11366
+ src += n_per_row;
11367
+ qrow += nblock*sizeof(block_iq2_xxs);
11368
+ }
11369
+ return nrow * nblock * sizeof(block_iq2_xxs);
11370
+ }
11371
+
11372
+ size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11373
+ (void)hist;
11374
+ GGML_ASSERT(n_per_row%QK_K == 0);
11375
+ int nblock = n_per_row/QK_K;
11376
+ char * qrow = (char *)dst;
11377
+ for (int row = 0; row < nrow; ++row) {
11378
+ quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
11379
+ src += n_per_row;
11380
+ qrow += nblock*sizeof(block_iq2_xs);
11381
+ }
11382
+ return nrow * nblock * sizeof(block_iq2_xs);
11383
+ }
11384
+
11385
+ //
11386
+ // ============================================= 3-bit using D4 lattice
11387
+ //
11388
+
11389
// Lookup tables for one IQ3 grid size, built by iq3xs_init_impl() and
// released by iq3xs_free_impl().
typedef struct {
    // grid points; each entry packs four int8 values of the form 2*l+1
    uint32_t *grid;
    // indexed by the packed 3-bit levels of a point: a value >= 0 is the
    // grid index, a negative value encodes an offset into `neighbours`
    int *map;
    // neighbour lists for off-grid points: a count followed by grid indices
    uint16_t *neighbours;
} iq3_entry_t;
11394
+
11395
+ static iq3_entry_t iq3_data[2] = {
11396
+ {NULL, NULL, NULL},
11397
+ {NULL, NULL, NULL},
11398
+ };
11399
+
11400
// Map a supported grid size to its slot in iq3_data: 256 -> 0, 512 -> 1.
// Any other size trips the assertion.
static inline int iq3_data_index(int grid_size) {
    (void)grid_size;
    GGML_ASSERT(grid_size == 256 || grid_size == 512);
    if (grid_size == 256) {
        return 0;
    }
    return 1;
}
11405
+
11406
// qsort() comparator for records made of two consecutive ints: orders by the
// first int and breaks ties with the second. Returns -1, 0 or 1.
static int iq3_compare_func(const void * left, const void * right) {
    const int * lhs = (const int *)left;
    const int * rhs = (const int *)right;
    for (int field = 0; field < 2; ++field) {
        if (lhs[field] < rhs[field]) {
            return -1;
        }
        if (lhs[field] > rhs[field]) {
            return 1;
        }
    }
    return 0;
}
11411
+
11412
+ void iq3xs_init_impl(int grid_size) {
11413
+ const int gindex = iq3_data_index(grid_size);
11414
+ if (iq3_data[gindex].grid) {
11415
+ return;
11416
+ }
11417
+ static const uint16_t kgrid_256[256] = {
11418
+ 0, 2, 4, 9, 11, 15, 16, 18, 25, 34, 59, 61, 65, 67, 72, 74,
11419
+ 81, 85, 88, 90, 97, 108, 120, 128, 130, 132, 137, 144, 146, 153, 155, 159,
11420
+ 169, 175, 189, 193, 199, 200, 202, 213, 248, 267, 287, 292, 303, 315, 317, 321,
11421
+ 327, 346, 362, 413, 436, 456, 460, 462, 483, 497, 513, 515, 520, 522, 529, 531,
11422
+ 536, 538, 540, 551, 552, 576, 578, 585, 592, 594, 641, 643, 648, 650, 657, 664,
11423
+ 698, 704, 706, 720, 729, 742, 758, 769, 773, 808, 848, 852, 870, 889, 901, 978,
11424
+ 992, 1024, 1026, 1033, 1035, 1040, 1042, 1046, 1049, 1058, 1089, 1091, 1093, 1096, 1098, 1105,
11425
+ 1112, 1139, 1143, 1144, 1152, 1154, 1161, 1167, 1168, 1170, 1183, 1184, 1197, 1217, 1224, 1228,
11426
+ 1272, 1276, 1309, 1323, 1347, 1367, 1377, 1404, 1473, 1475, 1486, 1509, 1537, 1544, 1546, 1553,
11427
+ 1555, 1576, 1589, 1594, 1600, 1602, 1616, 1625, 1636, 1638, 1665, 1667, 1672, 1685, 1706, 1722,
11428
+ 1737, 1755, 1816, 1831, 1850, 1856, 1862, 1874, 1901, 1932, 1950, 1971, 2011, 2032, 2052, 2063,
11429
+ 2077, 2079, 2091, 2095, 2172, 2192, 2207, 2208, 2224, 2230, 2247, 2277, 2308, 2345, 2356, 2389,
11430
+ 2403, 2424, 2501, 2504, 2506, 2520, 2570, 2593, 2616, 2624, 2630, 2646, 2669, 2700, 2714, 2746,
11431
+ 2754, 2795, 2824, 2835, 2839, 2874, 2882, 2905, 2984, 3028, 3042, 3092, 3108, 3110, 3124, 3153,
11432
+ 3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
11433
+ 3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
11434
+ };
11435
+ static const uint16_t kgrid_512[512] = {
11436
+ 0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34,
11437
+ 37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77,
11438
+ 80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142,
11439
+ 145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210,
11440
+ 217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288,
11441
+ 291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393,
11442
+ 395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514,
11443
+ 516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576,
11444
+ 577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653,
11445
+ 655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727,
11446
+ 728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833,
11447
+ 840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977,
11448
+ 989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047,
11449
+ 1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103,
11450
+ 1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199,
11451
+ 1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296,
11452
+ 1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415,
11453
+ 1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561,
11454
+ 1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648,
11455
+ 1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761,
11456
+ 1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877,
11457
+ 1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068,
11458
+ 2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177,
11459
+ 2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269,
11460
+ 2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520,
11461
+ 2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634,
11462
+ 2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805,
11463
+ 2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083,
11464
+ 3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276,
11465
+ 3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591,
11466
+ 3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729,
11467
+ 3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032,
11468
+ };
11469
+
11470
+ const int kmap_size = 4096;
11471
+ const int nwant = grid_size == 256 ? 2 : 3;
11472
+ const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
11473
+ uint32_t * kgrid_q3xs;
11474
+ int * kmap_q3xs;
11475
+ uint16_t * kneighbors_q3xs;
11476
+
11477
+ printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
11478
+ uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
11479
+ for (int k = 0; k < grid_size; ++k) {
11480
+ int8_t * pos = (int8_t *)(the_grid + k);
11481
+ for (int i = 0; i < 4; ++i) {
11482
+ int l = (kgrid[k] >> 3*i) & 0x7;
11483
+ pos[i] = 2*l + 1;
11484
+ }
11485
+ }
11486
+ kgrid_q3xs = the_grid;
11487
+ iq3_data[gindex].grid = the_grid;
11488
+ kmap_q3xs = (int *)malloc(kmap_size*sizeof(int));
11489
+ iq3_data[gindex].map = kmap_q3xs;
11490
+ for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1;
11491
+ uint32_t aux32;
11492
+ uint8_t * aux8 = (uint8_t *)&aux32;
9214
11493
  for (int i = 0; i < grid_size; ++i) {
9215
- aux64 = kgrid_q2xs[i];
11494
+ aux32 = kgrid_q3xs[i];
9216
11495
  uint16_t index = 0;
9217
- for (int k=0; k<8; ++k) {
11496
+ for (int k=0; k<4; ++k) {
9218
11497
  uint16_t q = (aux8[k] - 1)/2;
9219
- index |= (q << 2*k);
11498
+ index |= (q << 3*k);
9220
11499
  }
9221
- kmap_q2xs[index] = i;
11500
+ kmap_q3xs[index] = i;
9222
11501
  }
9223
- int8_t pos[8];
11502
+ int8_t pos[4];
9224
11503
  int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
9225
11504
  int num_neighbors = 0, num_not_in_map = 0;
9226
11505
  for (int i = 0; i < kmap_size; ++i) {
9227
- if (kmap_q2xs[i] >= 0) continue;
11506
+ if (kmap_q3xs[i] >= 0) continue;
9228
11507
  ++num_not_in_map;
9229
- for (int k = 0; k < 8; ++k) {
9230
- int l = (i >> 2*k) & 0x3;
11508
+ for (int k = 0; k < 4; ++k) {
11509
+ int l = (i >> 3*k) & 0x7;
9231
11510
  pos[k] = 2*l + 1;
9232
11511
  }
9233
11512
  for (int j = 0; j < grid_size; ++j) {
9234
- const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
11513
+ const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
9235
11514
  int d2 = 0;
9236
- for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
11515
+ for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
9237
11516
  dist2[2*j+0] = d2;
9238
11517
  dist2[2*j+1] = j;
9239
11518
  }
9240
- qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
11519
+ qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
9241
11520
  int n = 0; int d2 = dist2[0];
9242
11521
  int nhave = 1;
9243
11522
  for (int j = 0; j < grid_size; ++j) {
@@ -9251,26 +11530,26 @@ void iq2xs_init_impl(int grid_size) {
9251
11530
  num_neighbors += n;
9252
11531
  }
9253
11532
  printf("%s: %d neighbours in total\n", __func__, num_neighbors);
9254
- kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
9255
- iq2_data[gindex].neighbours = kneighbors_q2xs;
11533
+ kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
11534
+ iq3_data[gindex].neighbours = kneighbors_q3xs;
9256
11535
  int counter = 0;
9257
11536
  for (int i = 0; i < kmap_size; ++i) {
9258
- if (kmap_q2xs[i] >= 0) continue;
9259
- for (int k = 0; k < 8; ++k) {
9260
- int l = (i >> 2*k) & 0x3;
11537
+ if (kmap_q3xs[i] >= 0) continue;
11538
+ for (int k = 0; k < 4; ++k) {
11539
+ int l = (i >> 3*k) & 0x7;
9261
11540
  pos[k] = 2*l + 1;
9262
11541
  }
9263
11542
  for (int j = 0; j < grid_size; ++j) {
9264
- const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
11543
+ const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
9265
11544
  int d2 = 0;
9266
- for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
11545
+ for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
9267
11546
  dist2[2*j+0] = d2;
9268
11547
  dist2[2*j+1] = j;
9269
11548
  }
9270
- qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
9271
- kmap_q2xs[i] = -(counter + 1);
11549
+ qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
11550
+ kmap_q3xs[i] = -(counter + 1);
9272
11551
  int d2 = dist2[0];
9273
- uint16_t * start = &kneighbors_q2xs[counter++];
11552
+ uint16_t * start = &kneighbors_q3xs[counter++];
9274
11553
  int n = 0, nhave = 1;
9275
11554
  for (int j = 0; j < grid_size; ++j) {
9276
11555
  if (dist2[2*j] > d2) {
@@ -9278,7 +11557,7 @@ void iq2xs_init_impl(int grid_size) {
9278
11557
  d2 = dist2[2*j];
9279
11558
  ++nhave;
9280
11559
  }
9281
- kneighbors_q2xs[counter++] = dist2[2*j+1];
11560
+ kneighbors_q3xs[counter++] = dist2[2*j+1];
9282
11561
  ++n;
9283
11562
  }
9284
11563
  *start = n;
@@ -9286,17 +11565,17 @@ void iq2xs_init_impl(int grid_size) {
9286
11565
  free(dist2);
9287
11566
  }
9288
11567
 
9289
- void iq2xs_free_impl(int grid_size) {
9290
- GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
9291
- const int gindex = iq2_data_index(grid_size);
9292
- if (iq2_data[gindex].grid) {
9293
- free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
9294
- free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
9295
- free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
11568
+ void iq3xs_free_impl(int grid_size) {
11569
+ GGML_ASSERT(grid_size == 256 || grid_size == 512);
11570
+ const int gindex = iq3_data_index(grid_size);
11571
+ if (iq3_data[gindex].grid) {
11572
+ free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
11573
+ free(iq3_data[gindex].map); iq3_data[gindex].map = NULL;
11574
+ free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL;
9296
11575
  }
9297
11576
  }
9298
11577
 
9299
- static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
11578
+ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
9300
11579
  const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
9301
11580
  int num_neighbors = neighbours[0];
9302
11581
  GGML_ASSERT(num_neighbors > 0);
@@ -9305,7 +11584,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
9305
11584
  for (int j = 1; j <= num_neighbors; ++j) {
9306
11585
  const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
9307
11586
  float d2 = 0;
9308
- for (int i = 0; i < 8; ++i) {
11587
+ for (int i = 0; i < 4; ++i) {
9309
11588
  float q = pg[i];
9310
11589
  float diff = scale*q - xval[i];
9311
11590
  d2 += weight[i]*diff*diff;
@@ -9316,29 +11595,44 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
9316
11595
  }
9317
11596
  GGML_ASSERT(grid_index >= 0);
9318
11597
  const int8_t * pg = (const int8_t *)(grid + grid_index);
9319
- for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
11598
+ for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2;
9320
11599
  return grid_index;
9321
11600
  }
9322
11601
 
9323
- static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
11602
+ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
11603
+ const float * restrict quant_weights) {
9324
11604
 
9325
- const int gindex = iq2_data_index(256);
11605
+ const int gindex = iq3_data_index(grid_size);
9326
11606
 
9327
- const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
9328
- const int * kmap_q2xs = iq2_data[gindex].map;
9329
- const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
11607
+ const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
11608
+ const int * kmap_q3xs = iq3_data[gindex].map;
11609
+ const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
9330
11610
 
9331
- GGML_ASSERT(quant_weights && "missing quantization weights");
9332
- GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
9333
- GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
9334
- GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
11611
+ //GGML_ASSERT(quant_weights && "missing quantization weights");
11612
+ GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
11613
+ GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
11614
+ GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
9335
11615
  GGML_ASSERT(n%QK_K == 0);
9336
11616
 
9337
- const int kMaxQ = 3;
11617
+ const int kMaxQ = 8;
9338
11618
 
9339
- const int nbl = n/256;
11619
+ const int nbl = n/QK_K;
9340
11620
 
9341
- block_iq2_xxs * y = vy;
11621
+ ggml_fp16_t * dh;
11622
+ uint8_t * qs;
11623
+ int block_size;
11624
+ if (grid_size == 256) {
11625
+ block_iq3_xxs * y = vy;
11626
+ dh = &y->d;
11627
+ qs = y->qs;
11628
+ block_size = sizeof(block_iq3_xxs);
11629
+ } else {
11630
+ block_iq3_s * y = vy;
11631
+ dh = &y->d;
11632
+ qs = y->qs;
11633
+ block_size = sizeof(block_iq3_s);
11634
+ }
11635
+ int quant_size = block_size - sizeof(ggml_fp16_t);
9342
11636
 
9343
11637
  float scales[QK_K/32];
9344
11638
  float weight[32];
@@ -9346,25 +11640,33 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9346
11640
  int8_t L[32];
9347
11641
  int8_t Laux[32];
9348
11642
  float waux[32];
9349
- uint8_t block_signs[4];
9350
- uint32_t q2[2*(QK_K/32)];
11643
+ bool is_on_grid[8];
11644
+ bool is_on_grid_aux[8];
11645
+ uint8_t block_signs[8];
11646
+ uint8_t q3[3*(QK_K/8)+QK_K/32];
11647
+ uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
11648
+ uint8_t * qh = q3 + 3*(QK_K/8);
9351
11649
 
9352
11650
  for (int ibl = 0; ibl < nbl; ++ibl) {
9353
11651
 
9354
- y[ibl].d = GGML_FP32_TO_FP16(0.f);
9355
- memset(q2, 0, QK_K/4);
11652
+ dh[0] = GGML_FP32_TO_FP16(0.f);
11653
+ memset(q3, 0, 3*QK_K/8+QK_K/32);
9356
11654
 
9357
11655
  float max_scale = 0;
9358
11656
 
9359
11657
  const float * xbl = x + QK_K*ibl;
9360
11658
  float sumx2 = 0;
9361
11659
  for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
9362
- float sigma2 = sumx2/QK_K;
11660
+ float sigma2 = 2*sumx2/QK_K;
9363
11661
 
9364
11662
  for (int ib = 0; ib < QK_K/32; ++ib) {
9365
11663
  const float * xb = xbl + 32*ib;
9366
- const float * qw = quant_weights + QK_K*ibl + 32*ib;
9367
- for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11664
+ if (quant_weights) {
11665
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
11666
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11667
+ } else {
11668
+ for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
11669
+ }
9368
11670
  for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
9369
11671
  for (int k = 0; k < 4; ++k) {
9370
11672
  int nflip = 0;
@@ -9395,23 +11697,24 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9395
11697
  memset(L, 0, 32);
9396
11698
  continue;
9397
11699
  }
9398
- float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
9399
- float eff_max = scale*kMaxQ;
9400
11700
  float best = 0;
9401
- for (int is = -6; is <= 6; ++is) {
9402
- float id = (2*kMaxQ-1+is*0.1f)/eff_max;
11701
+ float scale = max/(2*kMaxQ-1);
11702
+ for (int is = -15; is <= 15; ++is) {
11703
+ float id = (2*kMaxQ-1+is*0.2f)/max;
9403
11704
  float this_scale = 1/id;
9404
- for (int k = 0; k < 4; ++k) {
9405
- for (int i = 0; i < 8; ++i) {
9406
- int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
9407
- Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
11705
+ for (int k = 0; k < 8; ++k) {
11706
+ for (int i = 0; i < 4; ++i) {
11707
+ int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
11708
+ Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
9408
11709
  }
9409
11710
  uint16_t u = 0;
9410
- for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
9411
- int grid_index = kmap_q2xs[u];
11711
+ for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
11712
+ int grid_index = kmap_q3xs[u];
11713
+ is_on_grid_aux[k] = true;
9412
11714
  if (grid_index < 0) {
9413
- const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
9414
- grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
11715
+ is_on_grid_aux[k] = false;
11716
+ const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
11717
+ grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
9415
11718
  }
9416
11719
  }
9417
11720
  float sumqx = 0, sumq2 = 0;
@@ -9423,25 +11726,29 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9423
11726
  }
9424
11727
  if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
9425
11728
  scale = sumqx/sumq2; best = scale*sumqx;
9426
- memcpy(L, Laux, 32);
11729
+ for (int i = 0; i < 32; ++i) L[i] = Laux[i];
11730
+ for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
9427
11731
  }
9428
11732
  }
9429
- if (scale > 0) {
11733
+ int n_not_ongrid = 0;
11734
+ for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
11735
+ if (n_not_ongrid > 0 && scale > 0) {
9430
11736
  float id = 1/scale;
9431
- for (int k = 0; k < 4; ++k) {
11737
+ for (int k = 0; k < 8; ++k) {
11738
+ if (is_on_grid[k]) continue;
9432
11739
  uint16_t u = 0;
9433
- for (int i = 0; i < 8; ++i) {
9434
- int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
11740
+ for (int i = 0; i < 4; ++i) {
11741
+ int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
9435
11742
  l = MAX(0, MIN(kMaxQ-1, l));
9436
- u |= (l << 2*i);
11743
+ u |= (l << 3*i);
9437
11744
  }
9438
- int grid_index = kmap_q2xs[u];
11745
+ int grid_index = kmap_q3xs[u];
9439
11746
  if (grid_index < 0) {
9440
- const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
9441
- grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
11747
+ const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
11748
+ grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
9442
11749
  }
9443
- const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
9444
- for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
11750
+ const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
11751
+ for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
9445
11752
  }
9446
11753
  float sumqx = 0, sumq2 = 0;
9447
11754
  for (int i = 0; i < 32; ++i) {
@@ -9458,142 +11765,173 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9458
11765
  scale = -scale;
9459
11766
  for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
9460
11767
  }
9461
- for (int k = 0; k < 4; ++k) {
11768
+ for (int k = 0; k < 8; ++k) {
9462
11769
  uint16_t u = 0;
9463
- for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
9464
- int grid_index = kmap_q2xs[u];
11770
+ for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
11771
+ int grid_index = kmap_q3xs[u];
9465
11772
  if (grid_index < 0) {
9466
11773
  printf("Oops: found point %u not on grid:", u);
9467
- for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
11774
+ for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
9468
11775
  printf("\n");
9469
11776
  GGML_ASSERT(false);
9470
11777
  }
9471
- q2[2*ib+0] |= (grid_index << 8*k);
9472
- q2[2*ib+1] |= (block_signs[k] << 7*k);
11778
+ if (grid_size == 256) {
11779
+ q3[8*ib+k] = grid_index;
11780
+ } else {
11781
+ q3[8*ib+k] = grid_index & 255;
11782
+ qh[ib] |= ((grid_index >> 8) << k);
11783
+ }
11784
+
9473
11785
  }
11786
+ scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
9474
11787
  GGML_ASSERT(scale >= 0);
9475
11788
  scales[ib] = scale;
9476
11789
  max_scale = MAX(max_scale, scale);
9477
11790
  }
9478
11791
 
9479
11792
  if (!max_scale) {
9480
- memset(y[ibl].qs, 0, QK_K/4);
11793
+ memset(qs, 0, quant_size);
11794
+ dh += block_size/sizeof(ggml_fp16_t);
11795
+ qs += block_size;
9481
11796
  continue;
9482
11797
  }
9483
11798
 
9484
11799
  float d = max_scale/31;
9485
- y[ibl].d = GGML_FP32_TO_FP16(d);
11800
+ dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor
9486
11801
  float id = 1/d;
9487
11802
  for (int ib = 0; ib < QK_K/32; ++ib) {
9488
11803
  int l = nearest_int(0.5f*(id*scales[ib]-1));
9489
11804
  l = MAX(0, MIN(15, l));
9490
- q2[2*ib+1] |= ((uint32_t)l << 28);
11805
+ scales_and_signs[ib] |= ((uint32_t)l << 28);
9491
11806
  }
9492
- memcpy(y[ibl].qs, q2, QK_K/4);
11807
+ memcpy(qs, q3, quant_size);
11808
+
11809
+ dh += block_size/sizeof(ggml_fp16_t);
11810
+ qs += block_size;
11811
+
9493
11812
  }
9494
11813
  }
9495
11814
 
9496
- static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
11815
+ size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11816
+ (void)hist;
11817
+ GGML_ASSERT(n_per_row%QK_K == 0);
11818
+ int nblock = n_per_row/QK_K;
11819
+ char * qrow = (char *)dst;
11820
+ for (int row = 0; row < nrow; ++row) {
11821
+ quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
11822
+ src += n_per_row;
11823
+ qrow += nblock*sizeof(block_iq3_xxs);
11824
+ }
11825
+ return nrow * nblock * sizeof(block_iq3_xxs);
11826
+ }
9497
11827
 
9498
- const int gindex = iq2_data_index(512);
11828
+ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
11829
+ assert(k % QK_K == 0);
11830
+ block_iq3_xxs * restrict y = vy;
11831
+ quantize_row_iq3_xxs_reference(x, y, k);
11832
+ }
9499
11833
 
9500
- const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
9501
- const int * kmap_q2xs = iq2_data[gindex].map;
9502
- const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
11834
+ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
11835
+ assert(k % QK_K == 0);
11836
+ quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
11837
+ }
9503
11838
 
9504
- GGML_ASSERT(quant_weights && "missing quantization weights");
9505
- GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
9506
- GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
9507
- GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
11839
+ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
11840
+ const float * restrict quant_weights,
11841
+ float * scales,
11842
+ float * weight,
11843
+ float * xval,
11844
+ int8_t * L,
11845
+ int8_t * Laux,
11846
+ float * waux,
11847
+ bool * is_on_grid,
11848
+ bool * is_on_grid_aux,
11849
+ uint8_t * block_signs) {
11850
+
11851
+ const int gindex = iq3_data_index(512);
11852
+
11853
+ const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
11854
+ const int * kmap_q3xs = iq3_data[gindex].map;
11855
+ const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
11856
+
11857
+ //GGML_ASSERT(quant_weights && "missing quantization weights");
11858
+ GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
11859
+ GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
11860
+ GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
9508
11861
  GGML_ASSERT(n%QK_K == 0);
9509
11862
 
9510
- const int kMaxQ = 3;
11863
+ const int kMaxQ = 8;
9511
11864
 
9512
- const int nbl = n/256;
11865
+ const int nbl = n/QK_K;
9513
11866
 
9514
- block_iq2_xs * y = vy;
11867
+ block_iq3_s * y = vy;
9515
11868
 
9516
- float scales[QK_K/16];
9517
- float weight[16];
9518
- float xval[16];
9519
- int8_t L[16];
9520
- int8_t Laux[16];
9521
- float waux[16];
9522
- bool is_on_grid[2];
9523
- bool is_on_grid_aux[2];
9524
- uint8_t block_signs[2];
9525
- uint16_t q2[2*(QK_K/16)];
11869
+ const int bs4 = block_size/4;
11870
+ const int bs8 = block_size/8;
9526
11871
 
9527
11872
  for (int ibl = 0; ibl < nbl; ++ibl) {
9528
11873
 
11874
+ memset(&y[ibl], 0, sizeof(block_iq3_s));
9529
11875
  y[ibl].d = GGML_FP32_TO_FP16(0.f);
9530
- memset(q2, 0, QK_K/4);
9531
- memset(y[ibl].scales, 0, QK_K/32);
11876
+
11877
+ uint8_t * qs = y[ibl].qs;
11878
+ uint8_t * qh = y[ibl].qh;
11879
+ uint8_t * signs = y[ibl].signs;
9532
11880
 
9533
11881
  float max_scale = 0;
9534
11882
 
9535
11883
  const float * xbl = x + QK_K*ibl;
9536
11884
  float sumx2 = 0;
9537
11885
  for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
9538
- float sigma2 = sumx2/QK_K;
9539
-
9540
- for (int ib = 0; ib < QK_K/16; ++ib) {
9541
- const float * xb = xbl + 16*ib;
9542
- const float * qw = quant_weights + QK_K*ibl + 16*ib;
9543
- for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
9544
- for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
9545
- for (int k = 0; k < 2; ++k) {
9546
- int nflip = 0;
9547
- uint8_t s = 0;
9548
- for (int i = 0; i < 8; ++i) {
9549
- if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
9550
- else {
9551
- xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
9552
- }
9553
- }
9554
- if (nflip%2) {
9555
- int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
9556
- for (int i = 1; i < 8; ++i) {
9557
- float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
9558
- if (ax < min) {
9559
- min = ax; imin = i;
9560
- }
11886
+ float sigma2 = 2*sumx2/QK_K;
11887
+
11888
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
11889
+ const float * xb = xbl + block_size*ib;
11890
+ if (quant_weights) {
11891
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
11892
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11893
+ } else {
11894
+ for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
11895
+ }
11896
+ for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
11897
+ for (int k = 0; k < bs8; ++k) {
11898
+ uint8_t s = 0;
11899
+ for (int i = 0; i < 8; ++i) {
11900
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
11901
+ else {
11902
+ xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
9561
11903
  }
9562
- xval[8*k+imin] = -xval[8*k+imin];
9563
- s ^= (1 << imin);
9564
11904
  }
9565
- block_signs[k] = s & 127;
11905
+ block_signs[k] = s;
9566
11906
  }
9567
11907
  float max = xval[0];
9568
- for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
11908
+ for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
9569
11909
  if (!max) {
9570
11910
  scales[ib] = 0;
9571
- memset(L, 0, 16);
9572
11911
  continue;
9573
11912
  }
9574
11913
  float best = 0;
9575
11914
  float scale = max/(2*kMaxQ-1);
9576
- is_on_grid[0] = is_on_grid[1] = true;
9577
- for (int is = -9; is <= 9; ++is) {
9578
- float id = (2*kMaxQ-1+is*0.1f)/max;
11915
+ for (int is = -15; is <= 15; ++is) {
11916
+ float id = (2*kMaxQ-1+is*0.2f)/max;
9579
11917
  float this_scale = 1/id;
9580
- for (int k = 0; k < 2; ++k) {
9581
- for (int i = 0; i < 8; ++i) {
9582
- int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
9583
- Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
11918
+ for (int k = 0; k < bs4; ++k) {
11919
+ for (int i = 0; i < 4; ++i) {
11920
+ int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
11921
+ Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
9584
11922
  }
9585
11923
  uint16_t u = 0;
9586
- for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
9587
- int grid_index = kmap_q2xs[u];
11924
+ for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
11925
+ int grid_index = kmap_q3xs[u];
9588
11926
  is_on_grid_aux[k] = true;
9589
11927
  if (grid_index < 0) {
9590
11928
  is_on_grid_aux[k] = false;
9591
- const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
9592
- grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
11929
+ const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
11930
+ grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
9593
11931
  }
9594
11932
  }
9595
11933
  float sumqx = 0, sumq2 = 0;
9596
- for (int i = 0; i < 16; ++i) {
11934
+ for (int i = 0; i < block_size; ++i) {
9597
11935
  float w = weight[i];
9598
11936
  float q = 2*Laux[i] + 1;
9599
11937
  sumqx += w*xval[i]*q;
@@ -9601,31 +11939,32 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
9601
11939
  }
9602
11940
  if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
9603
11941
  scale = sumqx/sumq2; best = scale*sumqx;
9604
- for (int i = 0; i < 16; ++i) L[i] = Laux[i];
9605
- for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
11942
+ for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
11943
+ for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
9606
11944
  }
9607
11945
  }
9608
11946
  int n_not_ongrid = 0;
9609
- for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
11947
+ for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
9610
11948
  if (n_not_ongrid > 0 && scale > 0) {
9611
11949
  float id = 1/scale;
9612
- for (int k = 0; k < 2; ++k) {
11950
+ for (int k = 0; k < bs4; ++k) {
9613
11951
  if (is_on_grid[k]) continue;
9614
11952
  uint16_t u = 0;
9615
- for (int i = 0; i < 8; ++i) {
9616
- int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
11953
+ for (int i = 0; i < 4; ++i) {
11954
+ int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
9617
11955
  l = MAX(0, MIN(kMaxQ-1, l));
9618
- u |= (l << 2*i);
9619
- L[8*k + i] = l;
11956
+ u |= (l << 3*i);
9620
11957
  }
9621
- int grid_index = kmap_q2xs[u];
11958
+ int grid_index = kmap_q3xs[u];
9622
11959
  if (grid_index < 0) {
9623
- const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
9624
- grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
11960
+ const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
11961
+ grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
9625
11962
  }
11963
+ const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
11964
+ for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
9626
11965
  }
9627
11966
  float sumqx = 0, sumq2 = 0;
9628
- for (int i = 0; i < 16; ++i) {
11967
+ for (int i = 0; i < block_size; ++i) {
9629
11968
  float w = weight[i];
9630
11969
  float q = 2*L[i] + 1;
9631
11970
  sumqx += w*xval[i]*q;
@@ -9634,356 +11973,572 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
9634
11973
  if (sumq2 > 0) scale = sumqx/sumq2;
9635
11974
  }
9636
11975
  if (scale < 0) {
11976
+ // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
11977
+ // and correspondingly flip quant signs.
9637
11978
  scale = -scale;
9638
- for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
11979
+ for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
9639
11980
  }
9640
- for (int k = 0; k < 2; ++k) {
11981
+ for (int k = 0; k < bs4; ++k) {
9641
11982
  uint16_t u = 0;
9642
- for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
9643
- int grid_index = kmap_q2xs[u];
11983
+ for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
11984
+ int grid_index = kmap_q3xs[u];
9644
11985
  if (grid_index < 0) {
9645
11986
  printf("Oops: found point %u not on grid:", u);
9646
- for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
11987
+ for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
9647
11988
  printf("\n");
9648
11989
  GGML_ASSERT(false);
9649
11990
  }
9650
- q2[2*ib+k] = grid_index | (block_signs[k] << 9);
11991
+ qs[k] = grid_index & 255;
11992
+ qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
9651
11993
  }
11994
+ qs += bs4;
11995
+ for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
11996
+ signs += bs8;
9652
11997
  GGML_ASSERT(scale >= 0);
9653
11998
  scales[ib] = scale;
9654
11999
  max_scale = MAX(max_scale, scale);
9655
12000
  }
9656
12001
 
9657
12002
  if (!max_scale) {
9658
- memset(y[ibl].qs, 0, QK_K/4);
9659
12003
  continue;
9660
12004
  }
9661
12005
 
9662
12006
  float d = max_scale/31;
9663
12007
  y[ibl].d = GGML_FP32_TO_FP16(d);
9664
12008
  float id = 1/d;
9665
- for (int ib = 0; ib < QK_K/16; ++ib) {
9666
- int l = nearest_int(0.5f*(id*scales[ib]-1));
9667
- l = MAX(0, MIN(15, l));
9668
- if (ib%2 == 0) y[ibl].scales[ib/2] = l;
9669
- else y[ibl].scales[ib/2] |= (l << 4);
12009
+ for (int ib = 0; ib < QK_K/block_size; ib += 2) {
12010
+ int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
12011
+ l1 = MAX(0, MIN(15, l1));
12012
+ int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
12013
+ l2 = MAX(0, MIN(15, l2));
12014
+ y[ibl].scales[ib/2] = l1 | (l2 << 4);
9670
12015
  }
9671
- memcpy(y[ibl].qs, q2, QK_K/4);
9672
12016
 
9673
12017
  }
9674
12018
  }
9675
12019
 
9676
- size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12020
+ #define IQ3S_BLOCK_SIZE 32
12021
+ size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
9677
12022
  (void)hist;
9678
12023
  GGML_ASSERT(n_per_row%QK_K == 0);
9679
12024
  int nblock = n_per_row/QK_K;
12025
+ float scales[QK_K/IQ3S_BLOCK_SIZE];
12026
+ float weight[IQ3S_BLOCK_SIZE];
12027
+ float xval[IQ3S_BLOCK_SIZE];
12028
+ int8_t L[IQ3S_BLOCK_SIZE];
12029
+ int8_t Laux[IQ3S_BLOCK_SIZE];
12030
+ float waux[IQ3S_BLOCK_SIZE];
12031
+ bool is_on_grid[IQ3S_BLOCK_SIZE/4];
12032
+ bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
12033
+ uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
9680
12034
  char * qrow = (char *)dst;
9681
12035
  for (int row = 0; row < nrow; ++row) {
9682
- quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
12036
+ quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
12037
+ scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
9683
12038
  src += n_per_row;
9684
- qrow += nblock*sizeof(block_iq2_xxs);
12039
+ qrow += nblock*sizeof(block_iq3_s);
9685
12040
  }
9686
- return nrow * nblock * sizeof(block_iq2_xxs);
12041
+ return nrow * nblock * sizeof(block_iq3_s);
9687
12042
  }
9688
12043
 
9689
- size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12044
+ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
12045
+ assert(k % QK_K == 0);
12046
+ block_iq3_s * restrict y = vy;
12047
+ quantize_row_iq3_s_reference(x, y, k);
12048
+ }
12049
+
12050
+ void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
12051
+ assert(k % QK_K == 0);
12052
+ quantize_iq3_s(x, y, 1, k, NULL, NULL);
12053
+ }
12054
+
12055
+
12056
+ // =================================== 1.5 bpw ===================================================
12057
+
12058
// Computes, for one 8-entry grid point pg, the two weighted sums needed to fit a scale to xval:
//   *sumqx = sum_i weight[i]*q_i*xval[i]
//   *sumq2 = sum_i weight[i]*q_i*q_i
// with q_i = (pg[i] - 3)/2 evaluated in integer arithmetic before conversion to float.
static void iq1_grid_point_sums(const int8_t * pg, const float * xval, const float * weight,
        float * sumqx, float * sumq2) {
    float sqx = 0, sq2 = 0;
    for (int i = 0; i < 8; ++i) {
        float q = (pg[i] - 3)/2;
        float w = weight[i];
        sqx += w*q*xval[i];
        sq2 += w*q*q;
    }
    *sumqx = sqx;
    *sumq2 = sq2;
}

// Picks the grid point that best matches the 8 values in xval under the given weights.
//
//   neighbours - neighbours[0] is the number of candidates, neighbours[1..] are their grid indices
//   grid       - grid table; each 64-bit entry is read as 8 int8 values
//   xval       - the 8 input values
//   weight     - the 8 weights
//   scale      - out: fitted scale sumqx/sumq2 of the chosen point, multiplied by 1.05
//   L          - out: the 8 values (pg[i] - 1)/2 of the chosen grid point
//   ngrid      - number of entries in grid, used when no neighbour qualifies
//
// A candidate qualifies when sumqx > 0 and sumq2 > 0; among those the one with the largest
// sumqx*sumqx/sumq2 wins. If no listed neighbour qualifies, the whole grid is scanned with the
// same criterion. If that fails too, diagnostics are printed and the final GGML_ASSERT fires.
// Returns the index of the chosen grid point.
static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);
    float best_score = 0;
    int grid_index = -1;
    for (int j = 1; j <= num_neighbors; ++j) {
        float sumqx, sumq2;
        iq1_grid_point_sums((const int8_t *)(grid + neighbours[j]), xval, weight, &sumqx, &sumq2);
        if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
            *scale = sumqx/sumq2; best_score = *scale * sumqx;
            grid_index = neighbours[j];
        }
    }
    if (grid_index < 0) {
        // None of the listed neighbours qualified: fall back to an exhaustive scan of the grid.
        for (int i = 0; i < ngrid; ++i) {
            float sumqx, sumq2;
            iq1_grid_point_sums((const int8_t *)(grid + i), xval, weight, &sumqx, &sumq2);
            if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
                *scale = sumqx/sumq2; best_score = *scale*sumqx;
                grid_index = i;
            }
        }
    }
    if (grid_index < 0) {
        // Still nothing: dump the per-neighbour sums before the assert below aborts.
        printf("Oops, did not find grid point\n");
        printf("Have %d neighbours\n", num_neighbors);
        for (int j = 1; j <= num_neighbors; ++j) {
            float sumqx, sumq2;
            iq1_grid_point_sums((const int8_t *)(grid + neighbours[j]), xval, weight, &sumqx, &sumq2);
            printf(" neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
        }
    }
    GGML_ASSERT(grid_index >= 0);
    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    *scale *= 1.05f;  // This is a fudge factor. Don't ask me why it improves the result.
    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    const int8_t * pg = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
    return grid_index;
}
12117
+
12118
// qsort() comparator that orders elements by the float stored at the start of each element,
// ascending. Returns -1, 0 or 1.
static int iq1_sort_helper(const void * left, const void * right) {
    const float lhs = *(const float *)left;
    const float rhs = *(const float *)right;
    if (lhs < rhs) {
        return -1;
    }
    if (lhs > rhs) {
        return 1;
    }
    return 0;
}
12123
+
12124
+ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12125
+
12126
+ const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
12127
+
12128
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
12129
+ const int * kmap_q2xs = iq2_data[gindex].map;
12130
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
12131
+
12132
+ GGML_ASSERT(quant_weights && "missing quantization weights");
12133
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
12134
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
12135
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
12136
+ GGML_ASSERT(n%QK_K == 0);
12137
+
12138
+ const int nbl = n/QK_K;
12139
+
12140
+ block_iq1_s * y = vy;
12141
+
12142
+ float scales[QK_K/8];
12143
+ float weight[8];
12144
+ int8_t L[8];
12145
+ float sumx[9];
12146
+ float sumw[9];
12147
+ float pairs[16];
12148
+ int * idx = (int *)(pairs + 1);
12149
+ uint8_t hbit[QK_K/8];
12150
+
12151
+ for (int ibl = 0; ibl < nbl; ++ibl) {
12152
+
12153
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
12154
+ memset(y[ibl].qs, 0, QK_K/8);
12155
+ memset(y[ibl].scales, 0, QK_K/16);
12156
+
12157
+ float max_scale = 0;
12158
+
12159
+ const float * xbl = x + QK_K*ibl;
12160
+ float sumx2 = 0;
12161
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
12162
+ float sigma2 = sumx2/QK_K;
12163
+
12164
+ for (int ib = 0; ib < QK_K/8; ++ib) {
12165
+ const float * xb = xbl + 8*ib;
12166
+ const float * qw = quant_weights + QK_K*ibl + 8*ib;
12167
+ for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
12168
+ float max = fabsf(xb[0]);
12169
+ for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i]));
12170
+ if (!max) {
12171
+ scales[ib] = 0;
12172
+ memset(L, 1, 8);
12173
+ continue;
12174
+ }
12175
+ // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
12176
+ // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
12177
+ // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
12178
+ // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
12179
+ // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
12180
+ // for each possible and score for each split.
12181
+ for (int j = 0; j < 8; ++j) {
12182
+ pairs[2*j] = xb[j];
12183
+ idx[2*j] = j;
12184
+ }
12185
+ qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper);
12186
+ {
12187
+ sumx[0] = sumw[0] = 0;
12188
+ for (int j = 0; j < 8; ++j) {
12189
+ int i = idx[2*j];
12190
+ sumx[j+1] = sumx[j] + weight[i]*xb[i];
12191
+ sumw[j+1] = sumw[j] + weight[i];
12192
+ }
12193
+ }
12194
+ float best_score = 0, scale = max;
12195
+ int besti1 = 0, besti2 = 0;
12196
+ for (int i1 = 0; i1 <= 8; ++i1) {
12197
+ for (int i2 = i1; i2 <= 8; ++i2) {
12198
+ float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]);
12199
+ float sumq2 = (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]);
12200
+ if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
12201
+ scale = sumqx/sumq2; best_score = scale*sumqx;
12202
+ besti1 = i1; besti2 = i2;
12203
+ }
12204
+ }
12205
+ }
12206
+ for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
12207
+ for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
12208
+ for (int j = besti2; j < 8; ++j) L[idx[2*j]] = 2;
12209
+ if (scale < 0) {
12210
+ for (int j = 0; j < 8; ++j) L[j] = 2 - L[j];
12211
+ scale = -scale;
12212
+ }
12213
+ // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring
12214
+ // grid point that minimizes SSD.
12215
+ uint16_t u = 0;
12216
+ for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
12217
+ int grid_index = kmap_q2xs[u];
12218
+ if (grid_index < 0) {
12219
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12220
+ grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
12221
+ GGML_ASSERT(grid_index >= 0);
12222
+ }
12223
+ y[ibl].qs[ib] = grid_index & 255;
12224
+ hbit[ib] = grid_index >> 8;
12225
+ GGML_ASSERT(scale >= 0);
12226
+ scales[ib] = scale;
12227
+ max_scale = MAX(max_scale, scale);
12228
+ }
12229
+
12230
+ if (!max_scale) {
12231
+ memset(y[ibl].qs, 0, QK_K/8);
12232
+ continue;
12233
+ }
12234
+
12235
+ float d = max_scale/15;
12236
+ y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
12237
+ float id = 1/d;
12238
+ for (int ib = 0; ib < QK_K/8; ++ib) {
12239
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
12240
+ l = MAX(0, MIN(7, l));
12241
+ if (hbit[ib]) l |= 8;
12242
+ y[ibl].scales[ib/2] |= (l << 4*(ib%2));
12243
+ }
12244
+ }
12245
+ }
12246
+
12247
+ size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
9690
12248
  (void)hist;
9691
12249
  GGML_ASSERT(n_per_row%QK_K == 0);
9692
12250
  int nblock = n_per_row/QK_K;
9693
12251
  char * qrow = (char *)dst;
9694
12252
  for (int row = 0; row < nrow; ++row) {
9695
- quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
12253
+ quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
9696
12254
  src += n_per_row;
9697
- qrow += nblock*sizeof(block_iq2_xs);
12255
+ qrow += nblock*sizeof(block_iq1_s);
9698
12256
  }
9699
- return nrow * nblock * sizeof(block_iq2_xs);
12257
+ return nrow * nblock * sizeof(block_iq1_s);
9700
12258
  }
9701
12259
 
9702
- //
9703
- // ============================================= 3-bit using D4 lattice
9704
- //
9705
-
9706
- typedef struct {
9707
- uint32_t * grid;
9708
- int * map;
9709
- uint16_t * neighbours;
9710
- } iq3_entry_t;
9711
-
9712
- static iq3_entry_t iq3_data[1] = {
9713
- {NULL, NULL, NULL},
9714
- };
12260
+ // ============================ 4-bit non-linear quants
9715
12261
 
9716
- static inline int iq3_data_index(int grid_size) {
9717
- (void)grid_size;
9718
- GGML_ASSERT(grid_size == 256);
9719
- return 0;
12262
// Returns the index of the entry of val[0..n-1] closest to x.
// The early-outs and the bisection rely on val being in ascending order.
// On an exact tie between two adjacent entries the upper index is returned.
static inline int best_index_int8(int n, const int8_t * val, float x) {
    // Values at or beyond either end of the table map to the end entries.
    if (x <= val[0]) {
        return 0;
    }
    if (x >= val[n-1]) {
        return n-1;
    }
    // Bisect until lo and hi are adjacent with val[lo] <= x < val[hi].
    int lo = 0;
    int hi = n-1;
    while (hi - lo > 1) {
        int mid = (lo + hi)/2;
        if (x < val[mid]) {
            hi = mid;
        } else {
            lo = mid;
        }
    }
    if (x - val[hi-1] < val[hi] - x) {
        return hi-1;
    }
    return hi;
}
9721
12272
 
9722
- static int iq3_compare_func(const void * left, const void * right) {
9723
- const int * l = (const int *)left;
9724
- const int * r = (const int *)right;
9725
- return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
9726
- }
12273
+ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
12274
+ ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
12275
+ float * scales, float * weight, uint8_t * L,
12276
+ const int8_t * values,
12277
+ const float * quant_weights) {
9727
12278
 
9728
- void iq3xs_init_impl(int grid_size) {
9729
- const int gindex = iq3_data_index(grid_size);
9730
- if (iq3_data[gindex].grid) {
9731
- return;
9732
- }
9733
- static const uint16_t kgrid_256[256] = {
9734
- 0, 2, 4, 9, 11, 15, 16, 18, 25, 34, 59, 61, 65, 67, 72, 74,
9735
- 81, 85, 88, 90, 97, 108, 120, 128, 130, 132, 137, 144, 146, 153, 155, 159,
9736
- 169, 175, 189, 193, 199, 200, 202, 213, 248, 267, 287, 292, 303, 315, 317, 321,
9737
- 327, 346, 362, 413, 436, 456, 460, 462, 483, 497, 513, 515, 520, 522, 529, 531,
9738
- 536, 538, 540, 551, 552, 576, 578, 585, 592, 594, 641, 643, 648, 650, 657, 664,
9739
- 698, 704, 706, 720, 729, 742, 758, 769, 773, 808, 848, 852, 870, 889, 901, 978,
9740
- 992, 1024, 1026, 1033, 1035, 1040, 1042, 1046, 1049, 1058, 1089, 1091, 1093, 1096, 1098, 1105,
9741
- 1112, 1139, 1143, 1144, 1152, 1154, 1161, 1167, 1168, 1170, 1183, 1184, 1197, 1217, 1224, 1228,
9742
- 1272, 1276, 1309, 1323, 1347, 1367, 1377, 1404, 1473, 1475, 1486, 1509, 1537, 1544, 1546, 1553,
9743
- 1555, 1576, 1589, 1594, 1600, 1602, 1616, 1625, 1636, 1638, 1665, 1667, 1672, 1685, 1706, 1722,
9744
- 1737, 1755, 1816, 1831, 1850, 1856, 1862, 1874, 1901, 1932, 1950, 1971, 2011, 2032, 2052, 2063,
9745
- 2077, 2079, 2091, 2095, 2172, 2192, 2207, 2208, 2224, 2230, 2247, 2277, 2308, 2345, 2356, 2389,
9746
- 2403, 2424, 2501, 2504, 2506, 2520, 2570, 2593, 2616, 2624, 2630, 2646, 2669, 2700, 2714, 2746,
9747
- 2754, 2795, 2824, 2835, 2839, 2874, 2882, 2905, 2984, 3028, 3042, 3092, 3108, 3110, 3124, 3153,
9748
- 3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
9749
- 3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
9750
- };
9751
- const int kmap_size = 4096;
9752
- const int nwant = 2;
9753
- const uint16_t * kgrid = kgrid_256;
9754
- uint32_t * kgrid_q3xs;
9755
- int * kmap_q3xs;
9756
- uint16_t * kneighbors_q3xs;
12279
+ const int ntry = 7;
9757
12280
 
9758
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
9759
- uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
9760
- for (int k = 0; k < grid_size; ++k) {
9761
- int8_t * pos = (int8_t *)(the_grid + k);
9762
- for (int i = 0; i < 4; ++i) {
9763
- int l = (kgrid[k] >> 3*i) & 0x7;
9764
- pos[i] = 2*l + 1;
9765
- }
9766
- }
9767
- kgrid_q3xs = the_grid;
9768
- iq3_data[gindex].grid = the_grid;
9769
- kmap_q3xs = (int *)malloc(kmap_size*sizeof(int));
9770
- iq3_data[gindex].map = kmap_q3xs;
9771
- for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1;
9772
- uint32_t aux32;
9773
- uint8_t * aux8 = (uint8_t *)&aux32;
9774
- for (int i = 0; i < grid_size; ++i) {
9775
- aux32 = kgrid_q3xs[i];
9776
- uint16_t index = 0;
9777
- for (int k=0; k<4; ++k) {
9778
- uint16_t q = (aux8[k] - 1)/2;
9779
- index |= (q << 3*k);
9780
- }
9781
- kmap_q3xs[index] = i;
9782
- }
9783
- int8_t pos[4];
9784
- int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
9785
- int num_neighbors = 0, num_not_in_map = 0;
9786
- for (int i = 0; i < kmap_size; ++i) {
9787
- if (kmap_q3xs[i] >= 0) continue;
9788
- ++num_not_in_map;
9789
- for (int k = 0; k < 4; ++k) {
9790
- int l = (i >> 3*k) & 0x7;
9791
- pos[k] = 2*l + 1;
9792
- }
9793
- for (int j = 0; j < grid_size; ++j) {
9794
- const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
9795
- int d2 = 0;
9796
- for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
9797
- dist2[2*j+0] = d2;
9798
- dist2[2*j+1] = j;
12281
+ float sigma2 = 0;
12282
+ for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
12283
+ sigma2 *= 2.f/super_block_size;
12284
+
12285
+ memset(q4, 0, super_block_size/2);
12286
+ dh[0] = GGML_FP32_TO_FP16(0.f);
12287
+
12288
+ float max_scale = 0, amax_scale = 0;
12289
+ for (int ib = 0; ib < super_block_size/block_size; ++ib) {
12290
+ const float * xb = x + ib*block_size;
12291
+ if (quant_weights) {
12292
+ const float * qw = quant_weights + ib*block_size;
12293
+ for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
12294
+ } else {
12295
+ for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
9799
12296
  }
9800
- qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
9801
- int n = 0; int d2 = dist2[0];
9802
- int nhave = 1;
9803
- for (int j = 0; j < grid_size; ++j) {
9804
- if (dist2[2*j] > d2) {
9805
- if (nhave == nwant) break;
9806
- d2 = dist2[2*j];
9807
- ++nhave;
12297
+ float amax = 0, max = 0;
12298
+ for (int j = 0; j < block_size; ++j) {
12299
+ float ax = fabsf(xb[j]);
12300
+ if (ax > amax) {
12301
+ amax = ax; max = xb[j];
9808
12302
  }
9809
- ++n;
9810
12303
  }
9811
- num_neighbors += n;
9812
- }
9813
- printf("%s: %d neighbours in total\n", __func__, num_neighbors);
9814
- kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
9815
- iq3_data[gindex].neighbours = kneighbors_q3xs;
9816
- int counter = 0;
9817
- for (int i = 0; i < kmap_size; ++i) {
9818
- if (kmap_q3xs[i] >= 0) continue;
9819
- for (int k = 0; k < 4; ++k) {
9820
- int l = (i >> 3*k) & 0x7;
9821
- pos[k] = 2*l + 1;
12304
+ if (!amax) {
12305
+ scales[ib] = 0;
12306
+ continue;
9822
12307
  }
9823
- for (int j = 0; j < grid_size; ++j) {
9824
- const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
9825
- int d2 = 0;
9826
- for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
9827
- dist2[2*j+0] = d2;
9828
- dist2[2*j+1] = j;
12308
+ float d = -max/values[0];
12309
+ float id = 1/d;
12310
+ float sumqx = 0, sumq2 = 0;
12311
+ for (int j = 0; j < block_size; ++j) {
12312
+ float al = id*xb[j];
12313
+ int l = best_index_int8(16, values, al);
12314
+ float q = values[l];
12315
+ float w = weight[j];
12316
+ sumqx += w*q*xb[j];
12317
+ sumq2 += w*q*q;
12318
+ }
12319
+ d = sumqx/sumq2;
12320
+ float best = d*sumqx;
12321
+ for (int itry = -ntry; itry <= ntry; ++itry) {
12322
+ id = (itry + values[0])/max;
12323
+ sumqx = sumq2 = 0;
12324
+ for (int j = 0; j < block_size; ++j) {
12325
+ float al = id*xb[j];
12326
+ int l = best_index_int8(16, values, al);
12327
+ float q = values[l];
12328
+ float w = weight[j];
12329
+ sumqx += w*q*xb[j];
12330
+ sumq2 += w*q*q;
12331
+ }
12332
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
12333
+ d = sumqx/sumq2; best = d * sumqx;
12334
+ }
9829
12335
  }
9830
- qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
9831
- kmap_q3xs[i] = -(counter + 1);
9832
- int d2 = dist2[0];
9833
- uint16_t * start = &kneighbors_q3xs[counter++];
9834
- int n = 0, nhave = 1;
9835
- for (int j = 0; j < grid_size; ++j) {
9836
- if (dist2[2*j] > d2) {
9837
- if (nhave == nwant) break;
9838
- d2 = dist2[2*j];
9839
- ++nhave;
12336
+ scales[ib] = d;
12337
+ float abs_d = fabsf(d);
12338
+ if (abs_d > amax_scale) {
12339
+ amax_scale = abs_d; max_scale = d;
12340
+ }
12341
+ }
12342
+
12343
+ if (super_block_size/block_size > 1) {
12344
+ int nb = super_block_size/block_size;
12345
+ memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
12346
+ float d = -max_scale/32;
12347
+ dh[0] = GGML_FP32_TO_FP16(d);
12348
+ float id = d ? 1/d : 0.f;
12349
+ for (int ib = 0; ib < super_block_size/block_size; ++ib) {
12350
+ int l = nearest_int(id*scales[ib]);
12351
+ l = MAX(-32, MIN(31, l));
12352
+ float dl = d * l;
12353
+ float idl = dl ? 1/dl : 0.f;
12354
+ uint8_t * Lb = L + ib*block_size;
12355
+ const float * xb = x + ib*block_size;
12356
+ for (int j = 0; j < block_size; ++j) {
12357
+ Lb[j] = best_index_int8(16, values, idl*xb[j]);
9840
12358
  }
9841
- kneighbors_q3xs[counter++] = dist2[2*j+1];
9842
- ++n;
12359
+ l += 32;
12360
+ uint8_t l_l = l & 0xf;
12361
+ uint8_t l_h = l >> 4;
12362
+ if (ib%2 == 0) scales_l[ib/2] = l_l;
12363
+ else scales_l[ib/2] |= (l_l << 4);
12364
+ scales_h[ib/8] |= (l_h << 2*(ib%8));
12365
+ }
12366
+ } else {
12367
+ dh[0] = GGML_FP32_TO_FP16(scales[0]);
12368
+ float id = scales[0] ? 1/scales[0] : 0;
12369
+ for (int j = 0; j < super_block_size; ++j) {
12370
+ L[j] = best_index_int8(16, values, id*x[j]);
9843
12371
  }
9844
- *start = n;
9845
12372
  }
9846
- free(dist2);
9847
- }
9848
12373
 
9849
- void iq3xs_free_impl(int grid_size) {
9850
- GGML_ASSERT(grid_size == 256);
9851
- const int gindex = iq3_data_index(grid_size);
9852
- if (iq3_data[gindex].grid) {
9853
- free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
9854
- free(iq3_data[gindex].map); iq3_data[gindex].map = NULL;
9855
- free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL;
12374
+ for (int i = 0; i < super_block_size/32; ++i) {
12375
+ for (int j = 0; j < 16; ++j) {
12376
+ q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
12377
+ }
9856
12378
  }
9857
12379
  }
9858
12380
 
9859
- static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
9860
- const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
9861
- int num_neighbors = neighbours[0];
9862
- GGML_ASSERT(num_neighbors > 0);
9863
- float best_d2 = FLT_MAX;
9864
- int grid_index = -1;
9865
- for (int j = 1; j <= num_neighbors; ++j) {
9866
- const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
9867
- float d2 = 0;
9868
- for (int i = 0; i < 4; ++i) {
9869
- float q = pg[i];
9870
- float diff = scale*q - xval[i];
9871
- d2 += weight[i]*diff*diff;
12381
+ size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12382
+ (void)hist;
12383
+ GGML_ASSERT(n_per_row%QK4_NL == 0);
12384
+ int nblock = n_per_row/QK4_NL;
12385
+ char * qrow = (char *)dst;
12386
+ uint8_t L[QK4_NL];
12387
+ float weight[QK4_NL];
12388
+ uint16_t unused_h;
12389
+ uint8_t * unused_l = NULL;
12390
+ float scale;
12391
+ for (int row = 0; row < nrow; ++row) {
12392
+ block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
12393
+ for (int ibl = 0; ibl < nblock; ++ibl) {
12394
+ const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
12395
+ quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
12396
+ &scale, weight, L, kvalues_iq4nl, qw);
9872
12397
  }
9873
- if (d2 < best_d2) {
9874
- best_d2 = d2; grid_index = neighbours[j];
12398
+ src += n_per_row;
12399
+ qrow += nblock*sizeof(block_iq4_nl);
12400
+ }
12401
+ return nrow * nblock * sizeof(block_iq4_nl);
12402
+ }
12403
+
12404
+ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12405
+ assert(k % QK4_NL == 0);
12406
+ block_iq4_nl * restrict y = vy;
12407
+ quantize_row_iq4_nl_reference(x, y, k);
12408
+ }
12409
+
12410
+ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
12411
+ assert(k % QK4_NL == 0);
12412
+ quantize_iq4_nl(x, y, 1, k, NULL, NULL);
12413
+ }
12414
+
12415
+ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12416
+ #if QK_K == 64
12417
+ return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
12418
+ #else
12419
+ (void)hist;
12420
+ GGML_ASSERT(n_per_row%QK_K == 0);
12421
+ int nblock = n_per_row/QK_K;
12422
+ char * qrow = (char *)dst;
12423
+ uint8_t L[QK_K];
12424
+ float weight[32];
12425
+ float scales[QK_K/32];
12426
+ for (int row = 0; row < nrow; ++row) {
12427
+ block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
12428
+ for (int ibl = 0; ibl < nblock; ++ibl) {
12429
+ const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
12430
+ quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
12431
+ scales, weight, L, kvalues_iq4nl, qw);
9875
12432
  }
12433
+ src += n_per_row;
12434
+ qrow += nblock*sizeof(block_iq4_xs);
9876
12435
  }
9877
- GGML_ASSERT(grid_index >= 0);
9878
- const int8_t * pg = (const int8_t *)(grid + grid_index);
9879
- for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2;
9880
- return grid_index;
12436
+ return nrow * nblock * sizeof(block_iq4_xs);
12437
+ #endif
9881
12438
  }
9882
12439
 
9883
- static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12440
+ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
12441
+ assert(k % QK_K == 0);
12442
+ block_iq4_xs * restrict y = vy;
12443
+ quantize_row_iq4_xs_reference(x, y, k);
12444
+ }
9884
12445
 
9885
- const int gindex = iq3_data_index(256);
12446
+ void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
12447
+ assert(k % QK_K == 0);
12448
+ quantize_iq4_xs(x, y, 1, k, NULL, NULL);
12449
+ }
9886
12450
 
9887
- const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
9888
- const int * kmap_q3xs = iq3_data[gindex].map;
9889
- const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
12451
+ // =============================== 2.5625 bpw
9890
12452
 
9891
- //GGML_ASSERT(quant_weights && "missing quantization weights");
9892
- GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
9893
- GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
9894
- GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
12453
+ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12454
+
12455
+ const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
12456
+
12457
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
12458
+ const int * kmap_q2xs = iq2_data[gindex].map;
12459
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
12460
+
12461
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
12462
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
12463
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
9895
12464
  GGML_ASSERT(n%QK_K == 0);
9896
12465
 
9897
- const int kMaxQ = 8;
12466
+ const int kMaxQ = 3;
9898
12467
 
9899
- const int nbl = n/256;
12468
+ const int nbl = n/QK_K;
9900
12469
 
9901
- block_iq3_xxs * y = vy;
12470
+ block_iq2_s * y = vy;
9902
12471
 
9903
- float scales[QK_K/32];
9904
- float weight[32];
9905
- float xval[32];
9906
- int8_t L[32];
9907
- int8_t Laux[32];
9908
- float waux[32];
9909
- bool is_on_grid[8];
9910
- bool is_on_grid_aux[8];
9911
- uint8_t block_signs[8];
9912
- uint8_t q3[3*(QK_K/8)];
9913
- uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
12472
+ float scales[QK_K/16];
12473
+ float weight[16];
12474
+ float xval[16];
12475
+ int8_t L[16];
12476
+ int8_t Laux[16];
12477
+ float waux[16];
12478
+ bool is_on_grid[2];
12479
+ bool is_on_grid_aux[2];
12480
+ uint8_t block_signs[2];
9914
12481
 
9915
12482
  for (int ibl = 0; ibl < nbl; ++ibl) {
9916
12483
 
12484
+ memset(&y[ibl], 0, sizeof(block_iq2_s));
9917
12485
  y[ibl].d = GGML_FP32_TO_FP16(0.f);
9918
- memset(q3, 0, 3*QK_K/8);
9919
12486
 
9920
12487
  float max_scale = 0;
9921
12488
 
9922
12489
  const float * xbl = x + QK_K*ibl;
9923
12490
  float sumx2 = 0;
9924
12491
  for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
9925
- float sigma2 = sumx2/QK_K;
12492
+ float sigma2 = 2*sumx2/QK_K;
9926
12493
 
9927
- for (int ib = 0; ib < QK_K/32; ++ib) {
9928
- const float * xb = xbl + 32*ib;
12494
+ for (int ib = 0; ib < QK_K/16; ++ib) {
12495
+ const float * xb = xbl + 16*ib;
9929
12496
  if (quant_weights) {
9930
- const float * qw = quant_weights + QK_K*ibl + 32*ib;
9931
- for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
12497
+ const float * qw = quant_weights + QK_K*ibl + 16*ib;
12498
+ for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
9932
12499
  } else {
9933
- for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
12500
+ for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
9934
12501
  }
9935
- for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
9936
- for (int k = 0; k < 4; ++k) {
9937
- int nflip = 0;
12502
+ for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
12503
+ for (int k = 0; k < 2; ++k) {
9938
12504
  uint8_t s = 0;
9939
12505
  for (int i = 0; i < 8; ++i) {
9940
12506
  if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
9941
12507
  else {
9942
- xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
9943
- }
9944
- }
9945
- if (nflip%2) {
9946
- int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
9947
- for (int i = 1; i < 8; ++i) {
9948
- float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
9949
- if (ax < min) {
9950
- min = ax; imin = i;
9951
- }
12508
+ xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
9952
12509
  }
9953
- xval[8*k+imin] = -xval[8*k+imin];
9954
- s ^= (1 << imin);
9955
12510
  }
9956
- block_signs[k] = s & 127;
12511
+ block_signs[k] = s;
9957
12512
  }
9958
12513
  float max = xval[0];
9959
- for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
12514
+ for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
9960
12515
  if (!max) {
9961
12516
  scales[ib] = 0;
9962
- memset(L, 0, 32);
9963
12517
  continue;
9964
12518
  }
9965
12519
  float best = 0;
9966
12520
  float scale = max/(2*kMaxQ-1);
9967
- for (int is = -15; is <= 15; ++is) {
9968
- float id = (2*kMaxQ-1+is*0.2f)/max;
12521
+ is_on_grid[0] = is_on_grid[1] = true;
12522
+ for (int is = -9; is <= 9; ++is) {
12523
+ float id = (2*kMaxQ-1+is*0.1f)/max;
9969
12524
  float this_scale = 1/id;
9970
- for (int k = 0; k < 8; ++k) {
9971
- for (int i = 0; i < 4; ++i) {
9972
- int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
9973
- Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
12525
+ for (int k = 0; k < 2; ++k) {
12526
+ for (int i = 0; i < 8; ++i) {
12527
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
12528
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
9974
12529
  }
9975
12530
  uint16_t u = 0;
9976
- for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
9977
- int grid_index = kmap_q3xs[u];
12531
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
12532
+ int grid_index = kmap_q2xs[u];
9978
12533
  is_on_grid_aux[k] = true;
9979
12534
  if (grid_index < 0) {
9980
12535
  is_on_grid_aux[k] = false;
9981
- const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
9982
- grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
12536
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12537
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
9983
12538
  }
9984
12539
  }
9985
12540
  float sumqx = 0, sumq2 = 0;
9986
- for (int i = 0; i < 32; ++i) {
12541
+ for (int i = 0; i < 16; ++i) {
9987
12542
  float w = weight[i];
9988
12543
  float q = 2*Laux[i] + 1;
9989
12544
  sumqx += w*xval[i]*q;
@@ -9991,32 +12546,31 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
9991
12546
  }
9992
12547
  if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
9993
12548
  scale = sumqx/sumq2; best = scale*sumqx;
9994
- for (int i = 0; i < 32; ++i) L[i] = Laux[i];
9995
- for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
12549
+ for (int i = 0; i < 16; ++i) L[i] = Laux[i];
12550
+ for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
9996
12551
  }
9997
12552
  }
9998
12553
  int n_not_ongrid = 0;
9999
- for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
12554
+ for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
10000
12555
  if (n_not_ongrid > 0 && scale > 0) {
10001
12556
  float id = 1/scale;
10002
- for (int k = 0; k < 8; ++k) {
12557
+ for (int k = 0; k < 2; ++k) {
10003
12558
  if (is_on_grid[k]) continue;
10004
12559
  uint16_t u = 0;
10005
- for (int i = 0; i < 4; ++i) {
10006
- int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
12560
+ for (int i = 0; i < 8; ++i) {
12561
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
10007
12562
  l = MAX(0, MIN(kMaxQ-1, l));
10008
- u |= (l << 3*i);
12563
+ u |= (l << 2*i);
12564
+ L[8*k + i] = l;
10009
12565
  }
10010
- int grid_index = kmap_q3xs[u];
12566
+ int grid_index = kmap_q2xs[u];
10011
12567
  if (grid_index < 0) {
10012
- const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
10013
- grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
12568
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12569
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
10014
12570
  }
10015
- const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
10016
- for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
10017
12571
  }
10018
12572
  float sumqx = 0, sumq2 = 0;
10019
- for (int i = 0; i < 32; ++i) {
12573
+ for (int i = 0; i < 16; ++i) {
10020
12574
  float w = weight[i];
10021
12575
  float q = 2*L[i] + 1;
10022
12576
  sumqx += w*xval[i]*q;
@@ -10025,110 +12579,65 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
10025
12579
  if (sumq2 > 0) scale = sumqx/sumq2;
10026
12580
  }
10027
12581
  if (scale < 0) {
10028
- // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
10029
- // and correspondingly flip quant signs.
10030
12582
  scale = -scale;
10031
- for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
12583
+ for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
10032
12584
  }
10033
- for (int k = 0; k < 8; ++k) {
12585
+ for (int k = 0; k < 2; ++k) {
10034
12586
  uint16_t u = 0;
10035
- for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
10036
- int grid_index = kmap_q3xs[u];
12587
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
12588
+ int grid_index = kmap_q2xs[u];
10037
12589
  if (grid_index < 0) {
10038
12590
  printf("Oops: found point %u not on grid:", u);
10039
- for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
12591
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
10040
12592
  printf("\n");
10041
12593
  GGML_ASSERT(false);
10042
12594
  }
10043
- q3[8*ib+k] = grid_index;
12595
+ const int i8 = 2*ib + k;
12596
+ y[ibl].qs[i8] = grid_index & 255;
12597
+ y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
12598
+ y[ibl].qs[QK_K/8 + i8] = block_signs[k];
10044
12599
  }
10045
- scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
10046
12600
  GGML_ASSERT(scale >= 0);
10047
12601
  scales[ib] = scale;
10048
12602
  max_scale = MAX(max_scale, scale);
10049
12603
  }
10050
12604
 
10051
12605
  if (!max_scale) {
10052
- memset(y[ibl].qs, 0, 3*QK_K/8);
10053
12606
  continue;
10054
12607
  }
10055
12608
 
10056
12609
  float d = max_scale/31;
10057
- y[ibl].d = GGML_FP32_TO_FP16(d);
12610
+ y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
10058
12611
  float id = 1/d;
10059
- float sumqx = 0, sumq2 = 0;
10060
- for (int ib = 0; ib < QK_K/32; ++ib) {
12612
+ for (int ib = 0; ib < QK_K/16; ++ib) {
10061
12613
  int l = nearest_int(0.5f*(id*scales[ib]-1));
10062
12614
  l = MAX(0, MIN(15, l));
10063
- scales_and_signs[ib] |= ((uint32_t)l << 28);
10064
- if (false) {
10065
- const float * xb = xbl + 32*ib;
10066
- if (quant_weights) {
10067
- const float * qw = quant_weights + QK_K*ibl + 32*ib;
10068
- for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
10069
- } else {
10070
- for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
10071
- }
10072
- const float db = 0.25f * d * (1 + 2*l);
10073
- for (int k = 0; k < 8; ++k) {
10074
- const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
10075
- const float * xk = xb + 4*k;
10076
- const float * wk = weight + 4*k;
10077
- //const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
10078
- const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
10079
- float best_mse = 0; int best_index = q3[8*ib+k];
10080
- for (int j = 0; j < 4; ++j) {
10081
- float diff = db * grid[j] * signs[j] - xk[j];
10082
- best_mse += wk[j] * diff * diff;
10083
- }
10084
- for (int idx = 0; idx < 256; ++idx) {
10085
- //grid = (const uint8_t *)(kgrid_q3xs + idx);
10086
- grid = (const uint8_t *)(iq3xxs_grid + idx);
10087
- float mse = 0;
10088
- for (int j = 0; j < 4; ++j) {
10089
- float diff = db * grid[j] * signs[j] - xk[j];
10090
- mse += wk[j] * diff * diff;
10091
- }
10092
- if (mse < best_mse) {
10093
- best_mse = mse; best_index = idx;
10094
- }
10095
- }
10096
- q3[8*ib+k] = best_index;
10097
- //grid = (const uint8_t *)(kgrid_q3xs + best_index);
10098
- grid = (const uint8_t *)(iq3xxs_grid + best_index);
10099
- for (int j = 0; j < 4; ++j) {
10100
- float q = db * grid[j] * signs[j];
10101
- sumqx += wk[j] * q * xk[j];
10102
- sumq2 += wk[j] * q * q;
10103
- }
10104
- }
10105
- if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
10106
- }
12615
+ if (ib%2 == 0) y[ibl].scales[ib/2] = l;
12616
+ else y[ibl].scales[ib/2] |= (l << 4);
10107
12617
  }
10108
- memcpy(y[ibl].qs, q3, 3*QK_K/8);
10109
12618
  }
10110
12619
  }
10111
12620
 
10112
- size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12621
+ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
10113
12622
  (void)hist;
10114
12623
  GGML_ASSERT(n_per_row%QK_K == 0);
10115
12624
  int nblock = n_per_row/QK_K;
10116
12625
  char * qrow = (char *)dst;
10117
12626
  for (int row = 0; row < nrow; ++row) {
10118
- quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights);
12627
+ quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
10119
12628
  src += n_per_row;
10120
- qrow += nblock*sizeof(block_iq3_xxs);
12629
+ qrow += nblock*sizeof(block_iq2_s);
10121
12630
  }
10122
- return nrow * nblock * sizeof(block_iq3_xxs);
12631
+ return nrow * nblock * sizeof(block_iq2_s);
10123
12632
  }
10124
12633
 
10125
- void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
12634
+ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
10126
12635
  assert(k % QK_K == 0);
10127
- block_iq3_xxs * restrict y = vy;
10128
- quantize_row_iq3_xxs_reference(x, y, k);
12636
+ quantize_iq2_s(x, y, 1, k, NULL, NULL);
10129
12637
  }
10130
12638
 
10131
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
12639
+ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
10132
12640
  assert(k % QK_K == 0);
10133
- quantize_row_iq3_xxs_impl(x, y, k, NULL);
12641
+ block_iq2_s * restrict y = vy;
12642
+ quantize_row_iq2_s_reference(x, y, k);
10134
12643
  }