llama_cpp 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,6 +51,7 @@
51
51
 
52
52
  #define UNUSED GGML_UNUSED
53
53
 
54
+ // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
54
55
  #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
55
56
 
56
57
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
@@ -463,8 +464,8 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
463
464
  }
464
465
 
465
466
  // NOTE: not tested
466
- inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
467
- int8x16_t res;
467
+ inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
468
+ uint8x16_t res;
468
469
 
469
470
  res[ 0] = a[b[ 0]];
470
471
  res[ 1] = a[b[ 1]];
@@ -3818,71 +3819,71 @@ static const uint32_t iq3xxs_grid[256] = {
3818
3819
  0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
3819
3820
  };
3820
3821
 
3821
- static const uint32_t iq3xs_grid[512] = {
3822
- 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
3823
- 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
3824
- 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
3825
- 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
3826
- 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
3827
- 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
3828
- 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
3829
- 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
3830
- 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
3831
- 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
3832
- 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
3833
- 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
3834
- 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
3835
- 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
3836
- 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
3837
- 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
3838
- 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
3839
- 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
3840
- 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
3841
- 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
3842
- 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
3843
- 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
3844
- 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
3845
- 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
3846
- 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
3847
- 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
3848
- 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
3849
- 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
3850
- 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
3851
- 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
3852
- 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
3853
- 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
3854
- 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
3855
- 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
3856
- 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
3857
- 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
3858
- 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
3859
- 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
3860
- 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
3861
- 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
3862
- 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
3863
- 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
3864
- 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
3865
- 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
3866
- 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
3867
- 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
3868
- 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
3869
- 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
3870
- 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
3871
- 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
3872
- 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
3873
- 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
3874
- 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
3875
- 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
3876
- 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
3877
- 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
3878
- 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
3879
- 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
3880
- 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
3881
- 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
3882
- 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
3883
- 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
3884
- 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
3885
- 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
3822
+ static const uint32_t iq3s_grid[512] = {
3823
+ 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
3824
+ 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
3825
+ 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
3826
+ 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
3827
+ 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
3828
+ 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
3829
+ 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
3830
+ 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
3831
+ 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
3832
+ 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
3833
+ 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
3834
+ 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
3835
+ 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
3836
+ 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
3837
+ 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
3838
+ 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
3839
+ 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
3840
+ 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
3841
+ 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
3842
+ 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
3843
+ 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
3844
+ 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
3845
+ 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
3846
+ 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
3847
+ 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
3848
+ 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
3849
+ 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
3850
+ 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
3851
+ 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
3852
+ 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
3853
+ 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
3854
+ 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
3855
+ 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
3856
+ 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
3857
+ 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
3858
+ 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
3859
+ 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
3860
+ 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
3861
+ 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
3862
+ 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
3863
+ 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
3864
+ 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
3865
+ 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
3866
+ 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
3867
+ 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
3868
+ 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
3869
+ 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
3870
+ 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
3871
+ 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
3872
+ 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
3873
+ 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
3874
+ 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
3875
+ 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
3876
+ 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
3877
+ 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
3878
+ 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
3879
+ 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
3880
+ 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
3881
+ 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
3882
+ 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
3883
+ 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
3884
+ 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
3885
+ 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
3886
+ 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
3886
3887
  };
3887
3888
 
3888
3889
  #define NGRID_IQ2XXS 512
@@ -4162,11 +4163,11 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
4162
4163
  const uint8_t * signs = x[i].signs;
4163
4164
 
4164
4165
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
4165
- const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f;
4166
- const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f;
4166
+ const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
4167
+ const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
4167
4168
  for (int l = 0; l < 4; ++l) {
4168
- const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
4169
- const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
4169
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
4170
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
4170
4171
  for (int j = 0; j < 4; ++j) {
4171
4172
  y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
4172
4173
  y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
@@ -4176,8 +4177,8 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
4176
4177
  qs += 8;
4177
4178
  signs += 4;
4178
4179
  for (int l = 0; l < 4; ++l) {
4179
- const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
4180
- const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
4180
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
4181
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
4181
4182
  for (int j = 0; j < 4; ++j) {
4182
4183
  y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
4183
4184
  y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
@@ -9563,7 +9564,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9563
9564
 
9564
9565
  const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
9565
9566
  const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
9566
- const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
9567
+ const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
9567
9568
 
9568
9569
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
9569
9570
  const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
@@ -9585,8 +9586,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9585
9586
  const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
9586
9587
  const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
9587
9588
 
9588
- const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
9589
- const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
9589
+ const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
9590
+ const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
9590
9591
 
9591
9592
  const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
9592
9593
 
@@ -9653,8 +9654,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9653
9654
 
9654
9655
  const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
9655
9656
  const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
9656
- const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
9657
- const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
9657
+ const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
9658
+ const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
9658
9659
 
9659
9660
  __m256i signs;
9660
9661
  signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
@@ -10089,18 +10090,34 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10089
10090
 
10090
10091
  #if defined(__ARM_NEON)
10091
10092
 
10093
+ typedef union {
10094
+ uint16x8_t vec_index;
10095
+ uint16_t index[8];
10096
+ } vec_index_t;
10097
+
10092
10098
  static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10093
10099
  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10094
10100
  };
10095
10101
 
10096
10102
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
10097
10103
 
10098
- const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
10099
- const uint8x16_t mask2 = vld1q_u8(k_mask2);
10104
+ static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
10105
+
10106
+ const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
10107
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
10108
+ const int16x8_t hshift = vld1q_s16(k_shift);
10109
+ const uint16x8_t m256 = vdupq_n_u16(256);
10110
+ const uint8x16_t m1 = vdupq_n_u8(1);
10100
10111
 
10101
10112
  uint8x16x2_t vs;
10102
10113
  ggml_int8x16x4_t q3s;
10103
10114
  ggml_int8x16x4_t q8b;
10115
+ vec_index_t idx;
10116
+
10117
+ #if QK_K == 256
10118
+ uint32_t scales32[2];
10119
+ const uint8_t * scales8 = (const uint8_t *)scales32;
10120
+ #endif
10104
10121
 
10105
10122
  float sumf = 0;
10106
10123
  for (int i = 0; i < nb; ++i) {
@@ -10109,47 +10126,63 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10109
10126
  const uint8_t * restrict qh = x[i].qh;
10110
10127
  const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10111
10128
  const int8_t * restrict q8 = y[i].qs;
10129
+
10130
+ #if QK_K == 256
10131
+ memcpy(scales32, x[i].scales, 4);
10132
+ scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
10133
+ scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
10134
+ #endif
10135
+
10112
10136
  int sumi1 = 0, sumi2 = 0;
10113
10137
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10114
10138
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
10115
- const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
10116
- iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
10117
- const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
10118
- iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
10119
- const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
10120
- iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
10121
- const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
10122
- iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
10123
- qs += 16;
10139
+
10140
+ const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
10141
+ idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
10142
+ const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
10143
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
10144
+ const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
10145
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
10146
+ idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
10147
+ const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
10148
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
10149
+ const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
10150
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
10151
+
10124
10152
 
10125
10153
  vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
10126
10154
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10127
10155
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10128
- vs.val[0] = vceqq_u8(vs.val[0], mask2);
10129
- vs.val[1] = vceqq_u8(vs.val[1], mask2);
10156
+ vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
10157
+ vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
10130
10158
 
10131
- q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0]));
10132
- q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
10159
+ q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
10160
+ q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
10133
10161
 
10134
10162
  vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
10135
10163
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10136
10164
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10137
- vs.val[0] = vceqq_u8(vs.val[0], mask2);
10138
- vs.val[1] = vceqq_u8(vs.val[1], mask2);
10165
+ vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
10166
+ vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
10139
10167
 
10140
10168
  signs += 4;
10141
10169
 
10142
- q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0]));
10143
- q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1]));
10170
+ q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
10171
+ q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
10144
10172
 
10145
10173
  const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
10146
10174
  const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
10175
+ #if QK_K == 256
10176
+ sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
10177
+ sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
10178
+ #else
10147
10179
  sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
10148
10180
  sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
10181
+ #endif
10149
10182
  }
10150
10183
  sumf += d*(sumi1 + sumi2);
10151
10184
  }
10152
- *s = 0.25f * sumf;
10185
+ *s = sumf;
10153
10186
 
10154
10187
  #elif defined(__AVX2__)
10155
10188
 
@@ -10164,6 +10197,16 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10164
10197
  const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
10165
10198
  const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
10166
10199
 
10200
+ const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
10201
+ const __m256i idx_mask = _mm256_set1_epi32(256);
10202
+
10203
+ typedef union {
10204
+ __m256i vec[2];
10205
+ uint32_t index[16];
10206
+ } index_t;
10207
+
10208
+ index_t idx;
10209
+
10167
10210
  __m256 accumf = _mm256_setzero_ps();
10168
10211
  for (int i = 0; i < nb; ++i) {
10169
10212
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
@@ -10176,24 +10219,25 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10176
10219
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10177
10220
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10178
10221
  const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10179
- const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)],
10180
- iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)],
10181
- iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)],
10182
- iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)],
10183
- iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)],
10184
- iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)],
10185
- iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)],
10186
- iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
10187
- qs += 8;
10188
- const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)],
10189
- iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)],
10190
- iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)],
10191
- iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)],
10192
- iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)],
10193
- iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)],
10194
- iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)],
10195
- iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]);
10196
- qs += 8;
10222
+ const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
10223
+ idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
10224
+ idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
10225
+ idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
10226
+ idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
10227
+ idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
10228
+ idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
10229
+
10230
+ // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
10231
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
10232
+ //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
10233
+ const __m256i q2_1 = _mm256_set_epi32(
10234
+ iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
10235
+ iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
10236
+ );
10237
+ const __m256i q2_2 = _mm256_set_epi32(
10238
+ iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
10239
+ iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
10240
+ );
10197
10241
 
10198
10242
  __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
10199
10243
  aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
@@ -10221,7 +10265,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10221
10265
 
10222
10266
  }
10223
10267
 
10224
- *s = 0.25f * hsum_float_8(accumf);
10268
+ *s = hsum_float_8(accumf);
10225
10269
 
10226
10270
  #else
10227
10271
 
@@ -10238,8 +10282,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10238
10282
  const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
10239
10283
  int32_t sumi = 0;
10240
10284
  for (int l = 0; l < 4; ++l) {
10241
- const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
10242
- const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
10285
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
10286
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
10243
10287
  for (int j = 0; j < 4; ++j) {
10244
10288
  sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10245
10289
  sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
@@ -10251,8 +10295,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10251
10295
  bsum += sumi * ls1;
10252
10296
  sumi = 0;
10253
10297
  for (int l = 0; l < 4; ++l) {
10254
- const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
10255
- const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
10298
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
10299
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
10256
10300
  for (int j = 0; j < 4; ++j) {
10257
10301
  sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10258
10302
  sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
@@ -10265,7 +10309,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10265
10309
  }
10266
10310
  sumf += d * bsum;
10267
10311
  }
10268
- *s = 0.25f * sumf;
10312
+ *s = sumf;
10269
10313
  #endif
10270
10314
  }
10271
10315
 
@@ -10508,10 +10552,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
10508
10552
  const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
10509
10553
  const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
10510
10554
  const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
10511
- const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10512
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10513
- const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10514
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10555
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10556
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10557
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10558
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10515
10559
  const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10516
10560
  const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10517
10561
  const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
@@ -10618,10 +10662,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
10618
10662
  const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10619
10663
  const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10620
10664
  const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10621
- const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10622
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10623
- const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10624
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10665
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10666
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10667
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10668
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10625
10669
  const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10626
10670
  const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10627
10671
  const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
@@ -11912,7 +11956,8 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11912
11956
  }
11913
11957
  float best = 0;
11914
11958
  float scale = max/(2*kMaxQ-1);
11915
- for (int is = -15; is <= 15; ++is) {
11959
+ for (int k = 0; k < bs4; ++k) is_on_grid[k] = false;
11960
+ for (int is = -9; is <= 9; ++is) {
11916
11961
  float id = (2*kMaxQ-1+is*0.2f)/max;
11917
11962
  float this_scale = 1/id;
11918
11963
  for (int k = 0; k < bs4; ++k) {
@@ -11948,7 +11993,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11948
11993
  if (n_not_ongrid > 0 && scale > 0) {
11949
11994
  float id = 1/scale;
11950
11995
  for (int k = 0; k < bs4; ++k) {
11951
- if (is_on_grid[k]) continue;
11996
+ //if (is_on_grid[k]) continue;
11952
11997
  uint16_t u = 0;
11953
11998
  for (int i = 0; i < 4; ++i) {
11954
11999
  int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
@@ -12004,7 +12049,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
12004
12049
  }
12005
12050
 
12006
12051
  float d = max_scale/31;
12007
- y[ibl].d = GGML_FP32_TO_FP16(d);
12052
+ y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f);
12008
12053
  float id = 1/d;
12009
12054
  for (int ib = 0; ib < QK_K/block_size; ib += 2) {
12010
12055
  int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));