llama_cpp 0.12.7 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +131 -288
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +29 -29
- data/vendor/tmp/llama.cpp/Makefile +10 -6
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -23
- data/vendor/tmp/llama.cpp/ggml-backend.h +17 -16
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +159 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1195 -139
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +27 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +1971 -271
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3586 -1201
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1391 -825
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +545 -210
- data/vendor/tmp/llama.cpp/ggml.h +65 -23
- data/vendor/tmp/llama.cpp/llama.cpp +1458 -763
- data/vendor/tmp/llama.cpp/llama.h +81 -75
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
@@ -51,6 +51,7 @@
|
|
51
51
|
|
52
52
|
#define UNUSED GGML_UNUSED
|
53
53
|
|
54
|
+
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
54
55
|
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
55
56
|
|
56
57
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
@@ -462,6 +463,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|
462
463
|
return res;
|
463
464
|
}
|
464
465
|
|
466
|
+
// NOTE: not tested
|
467
|
+
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
468
|
+
uint8x16_t res;
|
469
|
+
|
470
|
+
res[ 0] = a[b[ 0]];
|
471
|
+
res[ 1] = a[b[ 1]];
|
472
|
+
res[ 2] = a[b[ 2]];
|
473
|
+
res[ 3] = a[b[ 3]];
|
474
|
+
res[ 4] = a[b[ 4]];
|
475
|
+
res[ 5] = a[b[ 5]];
|
476
|
+
res[ 6] = a[b[ 6]];
|
477
|
+
res[ 7] = a[b[ 7]];
|
478
|
+
res[ 8] = a[b[ 8]];
|
479
|
+
res[ 9] = a[b[ 9]];
|
480
|
+
res[10] = a[b[10]];
|
481
|
+
res[11] = a[b[11]];
|
482
|
+
res[12] = a[b[12]];
|
483
|
+
res[13] = a[b[13]];
|
484
|
+
res[14] = a[b[14]];
|
485
|
+
res[15] = a[b[15]];
|
486
|
+
|
487
|
+
return res;
|
488
|
+
}
|
489
|
+
|
465
490
|
#else
|
466
491
|
|
467
492
|
#define ggml_int16x8x2_t int16x8x2_t
|
@@ -476,6 +501,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|
476
501
|
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
477
502
|
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
478
503
|
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
504
|
+
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
479
505
|
|
480
506
|
#endif
|
481
507
|
|
@@ -1852,7 +1878,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
1852
1878
|
float mins[QK_K/16];
|
1853
1879
|
float scales[QK_K/16];
|
1854
1880
|
float sw[QK_K/16];
|
1855
|
-
float weight[
|
1881
|
+
float weight[16];
|
1856
1882
|
uint8_t Ls[QK_K/16], Lm[QK_K/16];
|
1857
1883
|
|
1858
1884
|
for (int i = 0; i < nb; i++) {
|
@@ -1862,13 +1888,42 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
1862
1888
|
float sigma2 = sumx2/QK_K;
|
1863
1889
|
for (int j = 0; j < QK_K/16; ++j) {
|
1864
1890
|
const float * restrict qw = quant_weights + QK_K * i + 16*j;
|
1865
|
-
for (int l = 0; l <
|
1891
|
+
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
1866
1892
|
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
|
1867
|
-
scales[j] = make_qkx3_quants(
|
1893
|
+
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
1868
1894
|
}
|
1869
1895
|
|
1870
|
-
float dm
|
1871
|
-
|
1896
|
+
float dm, mm;
|
1897
|
+
#if QK_K == 64
|
1898
|
+
float max_scale = 0, max_min = 0;
|
1899
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
1900
|
+
max_scale = MAX(max_scale, scales[j]);
|
1901
|
+
max_min = MAX(max_min, mins[j]);
|
1902
|
+
}
|
1903
|
+
dm = max_scale/15;
|
1904
|
+
mm = max_min/15;
|
1905
|
+
if (max_scale) {
|
1906
|
+
float id = 1/dm;
|
1907
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
1908
|
+
int l = nearest_int(id*scales[j]);
|
1909
|
+
Ls[j] = MAX(0, MIN(15, l));
|
1910
|
+
}
|
1911
|
+
} else {
|
1912
|
+
memset(Ls, 0, QK_K/16);
|
1913
|
+
}
|
1914
|
+
if (max_min) {
|
1915
|
+
float id = 1/mm;
|
1916
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
1917
|
+
int l = nearest_int(id*mins[j]);
|
1918
|
+
Lm[j] = MAX(0, MIN(15, l));
|
1919
|
+
}
|
1920
|
+
} else {
|
1921
|
+
memset(Lm, 0, QK_K/16);
|
1922
|
+
}
|
1923
|
+
#else
|
1924
|
+
dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
|
1925
|
+
mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
|
1926
|
+
#endif
|
1872
1927
|
y[i].d = GGML_FP32_TO_FP16(dm);
|
1873
1928
|
y[i].dmin = GGML_FP32_TO_FP16(mm);
|
1874
1929
|
dm = GGML_FP16_TO_FP32(y[i].d);
|
@@ -3470,6 +3525,265 @@ static const uint64_t iq2xs_grid[512] = {
|
|
3470
3525
|
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
3471
3526
|
};
|
3472
3527
|
|
3528
|
+
static const uint64_t iq2s_grid[1024] = {
|
3529
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
3530
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
3531
|
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
3532
|
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
3533
|
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
3534
|
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
3535
|
+
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
3536
|
+
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
3537
|
+
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
3538
|
+
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
3539
|
+
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
3540
|
+
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
3541
|
+
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
3542
|
+
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
3543
|
+
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
3544
|
+
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
3545
|
+
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
3546
|
+
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
3547
|
+
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
3548
|
+
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
3549
|
+
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
3550
|
+
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
3551
|
+
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
3552
|
+
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
3553
|
+
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
3554
|
+
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
3555
|
+
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
3556
|
+
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
3557
|
+
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
3558
|
+
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
3559
|
+
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
3560
|
+
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
3561
|
+
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
3562
|
+
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
3563
|
+
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
3564
|
+
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
3565
|
+
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
3566
|
+
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
3567
|
+
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
3568
|
+
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
3569
|
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
3570
|
+
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
3571
|
+
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
3572
|
+
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
3573
|
+
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
3574
|
+
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
3575
|
+
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
3576
|
+
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
3577
|
+
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
3578
|
+
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
3579
|
+
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
3580
|
+
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
3581
|
+
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
3582
|
+
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
3583
|
+
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
3584
|
+
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
3585
|
+
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
3586
|
+
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
3587
|
+
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
3588
|
+
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
3589
|
+
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
3590
|
+
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
3591
|
+
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
3592
|
+
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
3593
|
+
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
3594
|
+
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
3595
|
+
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
3596
|
+
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
3597
|
+
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
3598
|
+
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
3599
|
+
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
3600
|
+
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
3601
|
+
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
3602
|
+
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
3603
|
+
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
3604
|
+
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
3605
|
+
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
3606
|
+
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
3607
|
+
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
3608
|
+
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
3609
|
+
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
3610
|
+
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
3611
|
+
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
3612
|
+
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
3613
|
+
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
3614
|
+
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
3615
|
+
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
3616
|
+
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
3617
|
+
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
3618
|
+
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
3619
|
+
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
3620
|
+
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
3621
|
+
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
3622
|
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
3623
|
+
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
3624
|
+
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
3625
|
+
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
3626
|
+
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
3627
|
+
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
3628
|
+
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
3629
|
+
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
3630
|
+
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
3631
|
+
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
3632
|
+
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
3633
|
+
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
3634
|
+
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
3635
|
+
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
3636
|
+
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
3637
|
+
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
3638
|
+
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
3639
|
+
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
3640
|
+
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
3641
|
+
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
3642
|
+
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
3643
|
+
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
3644
|
+
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
3645
|
+
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
3646
|
+
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
3647
|
+
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
3648
|
+
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
3649
|
+
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
3650
|
+
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
3651
|
+
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
3652
|
+
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
3653
|
+
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
3654
|
+
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
3655
|
+
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
3656
|
+
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
3657
|
+
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
3658
|
+
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
3659
|
+
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
3660
|
+
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
3661
|
+
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
3662
|
+
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
3663
|
+
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
3664
|
+
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
3665
|
+
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
3666
|
+
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
3667
|
+
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
3668
|
+
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
3669
|
+
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
3670
|
+
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
3671
|
+
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
3672
|
+
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
3673
|
+
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
3674
|
+
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
3675
|
+
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
3676
|
+
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
3677
|
+
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
3678
|
+
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
3679
|
+
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
3680
|
+
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
3681
|
+
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
3682
|
+
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
3683
|
+
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
3684
|
+
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
3685
|
+
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
3686
|
+
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
3687
|
+
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
3688
|
+
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
3689
|
+
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
3690
|
+
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
3691
|
+
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
3692
|
+
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
3693
|
+
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
3694
|
+
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
3695
|
+
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
3696
|
+
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
3697
|
+
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
3698
|
+
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
3699
|
+
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
3700
|
+
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
3701
|
+
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
3702
|
+
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
3703
|
+
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
3704
|
+
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
3705
|
+
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
3706
|
+
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
3707
|
+
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
3708
|
+
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
3709
|
+
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
3710
|
+
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
3711
|
+
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
3712
|
+
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
3713
|
+
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
3714
|
+
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
3715
|
+
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
3716
|
+
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
3717
|
+
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
3718
|
+
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
3719
|
+
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
3720
|
+
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
3721
|
+
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
3722
|
+
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
3723
|
+
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
3724
|
+
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
3725
|
+
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
3726
|
+
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
3727
|
+
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
3728
|
+
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
3729
|
+
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
3730
|
+
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
3731
|
+
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
3732
|
+
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
3733
|
+
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
3734
|
+
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
3735
|
+
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
3736
|
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
3737
|
+
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
3738
|
+
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
3739
|
+
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
3740
|
+
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
3741
|
+
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
3742
|
+
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
3743
|
+
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
3744
|
+
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
3745
|
+
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
3746
|
+
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
3747
|
+
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
3748
|
+
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
3749
|
+
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
3750
|
+
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
3751
|
+
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
3752
|
+
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
3753
|
+
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
3754
|
+
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
3755
|
+
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
3756
|
+
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
3757
|
+
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
3758
|
+
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
3759
|
+
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
3760
|
+
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
3761
|
+
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
3762
|
+
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
3763
|
+
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
3764
|
+
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
3765
|
+
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
3766
|
+
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
3767
|
+
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
3768
|
+
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
3769
|
+
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
3770
|
+
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
3771
|
+
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
3772
|
+
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
3773
|
+
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
3774
|
+
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
3775
|
+
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
3776
|
+
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
3777
|
+
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
3778
|
+
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
3779
|
+
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
3780
|
+
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
3781
|
+
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
3782
|
+
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
3783
|
+
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
3784
|
+
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
3785
|
+
};
|
3786
|
+
|
3473
3787
|
static const uint32_t iq3xxs_grid[256] = {
|
3474
3788
|
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
3475
3789
|
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
@@ -3505,6 +3819,73 @@ static const uint32_t iq3xxs_grid[256] = {
|
|
3505
3819
|
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
3506
3820
|
};
|
3507
3821
|
|
3822
|
+
static const uint32_t iq3s_grid[512] = {
|
3823
|
+
0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
|
3824
|
+
0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
|
3825
|
+
0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
|
3826
|
+
0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
|
3827
|
+
0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
|
3828
|
+
0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
|
3829
|
+
0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
|
3830
|
+
0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
|
3831
|
+
0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
|
3832
|
+
0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
|
3833
|
+
0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
|
3834
|
+
0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
|
3835
|
+
0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
|
3836
|
+
0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
|
3837
|
+
0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
|
3838
|
+
0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
|
3839
|
+
0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
|
3840
|
+
0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
|
3841
|
+
0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
|
3842
|
+
0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
|
3843
|
+
0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
|
3844
|
+
0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
|
3845
|
+
0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
|
3846
|
+
0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
|
3847
|
+
0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
|
3848
|
+
0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
|
3849
|
+
0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
|
3850
|
+
0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
|
3851
|
+
0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
|
3852
|
+
0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
|
3853
|
+
0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
|
3854
|
+
0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
|
3855
|
+
0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
|
3856
|
+
0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
|
3857
|
+
0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
|
3858
|
+
0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
|
3859
|
+
0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
|
3860
|
+
0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
|
3861
|
+
0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
|
3862
|
+
0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
|
3863
|
+
0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
|
3864
|
+
0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
|
3865
|
+
0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
|
3866
|
+
0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
|
3867
|
+
0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
|
3868
|
+
0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
|
3869
|
+
0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
|
3870
|
+
0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
|
3871
|
+
0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
|
3872
|
+
0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
|
3873
|
+
0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
|
3874
|
+
0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
|
3875
|
+
0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
|
3876
|
+
0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
|
3877
|
+
0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
|
3878
|
+
0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
|
3879
|
+
0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
|
3880
|
+
0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
|
3881
|
+
0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
|
3882
|
+
0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
|
3883
|
+
0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
|
3884
|
+
0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
|
3885
|
+
0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
|
3886
|
+
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
|
3887
|
+
};
|
3888
|
+
|
3508
3889
|
#define NGRID_IQ2XXS 512
|
3509
3890
|
static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
|
3510
3891
|
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
@@ -3704,6 +4085,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
3704
4085
|
}
|
3705
4086
|
}
|
3706
4087
|
|
4088
|
+
// ====================== 2.5625 bpw (de)-quantization
|
4089
|
+
|
4090
|
+
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
|
4091
|
+
assert(k % QK_K == 0);
|
4092
|
+
const int nb = k / QK_K;
|
4093
|
+
|
4094
|
+
float db[2];
|
4095
|
+
|
4096
|
+
for (int i = 0; i < nb; i++) {
|
4097
|
+
|
4098
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4099
|
+
const uint8_t * qs = x[i].qs;
|
4100
|
+
const uint8_t * qh = x[i].qh;
|
4101
|
+
const uint8_t * signs = qs + QK_K/8;
|
4102
|
+
|
4103
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
4104
|
+
db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
|
4105
|
+
db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
|
4106
|
+
for (int l = 0; l < 4; ++l) {
|
4107
|
+
const float dl = db[l/2];
|
4108
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
4109
|
+
for (int j = 0; j < 8; ++j) {
|
4110
|
+
y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
4111
|
+
}
|
4112
|
+
y += 8;
|
4113
|
+
}
|
4114
|
+
qs += 4;
|
4115
|
+
signs += 4;
|
4116
|
+
}
|
4117
|
+
}
|
4118
|
+
}
|
4119
|
+
|
3707
4120
|
// ====================== 3.0625 bpw (de)-quantization
|
3708
4121
|
|
3709
4122
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
|
@@ -3736,6 +4149,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
|
3736
4149
|
}
|
3737
4150
|
}
|
3738
4151
|
|
4152
|
+
// ====================== 3.3125 bpw (de)-quantization
|
4153
|
+
|
4154
|
+
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
|
4155
|
+
assert(k % QK_K == 0);
|
4156
|
+
const int nb = k / QK_K;
|
4157
|
+
|
4158
|
+
for (int i = 0; i < nb; i++) {
|
4159
|
+
|
4160
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4161
|
+
const uint8_t * qs = x[i].qs;
|
4162
|
+
const uint8_t * qh = x[i].qh;
|
4163
|
+
const uint8_t * signs = x[i].signs;
|
4164
|
+
|
4165
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
4166
|
+
const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
4167
|
+
const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
4168
|
+
for (int l = 0; l < 4; ++l) {
|
4169
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
4170
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
4171
|
+
for (int j = 0; j < 4; ++j) {
|
4172
|
+
y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4173
|
+
y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4174
|
+
}
|
4175
|
+
y += 8;
|
4176
|
+
}
|
4177
|
+
qs += 8;
|
4178
|
+
signs += 4;
|
4179
|
+
for (int l = 0; l < 4; ++l) {
|
4180
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
|
4181
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
|
4182
|
+
for (int j = 0; j < 4; ++j) {
|
4183
|
+
y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4184
|
+
y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4185
|
+
}
|
4186
|
+
y += 8;
|
4187
|
+
}
|
4188
|
+
qh += 2;
|
4189
|
+
qs += 8;
|
4190
|
+
signs += 4;
|
4191
|
+
}
|
4192
|
+
}
|
4193
|
+
}
|
4194
|
+
|
3739
4195
|
// ====================== 1.5625 bpw (de)-quantization
|
3740
4196
|
|
3741
4197
|
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
|
@@ -3799,6 +4255,33 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
|
3799
4255
|
}
|
3800
4256
|
}
|
3801
4257
|
|
4258
|
+
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
4259
|
+
assert(k % QK_K == 0);
|
4260
|
+
#if QK_K == 64
|
4261
|
+
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
4262
|
+
#else
|
4263
|
+
const int nb = k / QK_K;
|
4264
|
+
|
4265
|
+
for (int i = 0; i < nb; i++) {
|
4266
|
+
|
4267
|
+
const uint8_t * qs = x[i].qs;
|
4268
|
+
|
4269
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4270
|
+
|
4271
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
4272
|
+
const int ls = ((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4);
|
4273
|
+
const float dl = d * (ls - 32);
|
4274
|
+
for (int j = 0; j < 16; ++j) {
|
4275
|
+
y[j+ 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
|
4276
|
+
y[j+16] = dl * kvalues_iq4nl[qs[j] >> 4];
|
4277
|
+
}
|
4278
|
+
y += 32;
|
4279
|
+
qs += 16;
|
4280
|
+
}
|
4281
|
+
}
|
4282
|
+
#endif
|
4283
|
+
}
|
4284
|
+
|
3802
4285
|
//===================================== Q8_K ==============================================
|
3803
4286
|
|
3804
4287
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
@@ -5857,7 +6340,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
5857
6340
|
|
5858
6341
|
float sumf = 0;
|
5859
6342
|
|
5860
|
-
int isum[
|
6343
|
+
int isum[QK_K/16];
|
5861
6344
|
|
5862
6345
|
for (int i = 0; i < nb; ++i) {
|
5863
6346
|
|
@@ -5873,14 +6356,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
5873
6356
|
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
5874
6357
|
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
5875
6358
|
|
5876
|
-
isum
|
6359
|
+
memset(isum, 0, (QK_K/16)*sizeof(int));
|
5877
6360
|
for (int l = 0; l < 16; ++l) {
|
5878
6361
|
isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
|
5879
6362
|
isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
|
5880
6363
|
isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
|
5881
6364
|
isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
|
5882
6365
|
}
|
5883
|
-
for (int l = 0; l <
|
6366
|
+
for (int l = 0; l < QK_K/16; ++l) {
|
5884
6367
|
isum[l] *= (sc[l] & 0xF);
|
5885
6368
|
}
|
5886
6369
|
sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
|
@@ -8806,6 +9289,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
8806
9289
|
|
8807
9290
|
#endif
|
8808
9291
|
|
9292
|
+
#if defined (__AVX2__) || defined (__ARM_NEON)
|
8809
9293
|
static const int8_t keven_signs_q2xs[1024] = {
|
8810
9294
|
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
8811
9295
|
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
@@ -8840,6 +9324,7 @@ static const int8_t keven_signs_q2xs[1024] = {
|
|
8840
9324
|
1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
|
8841
9325
|
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
8842
9326
|
};
|
9327
|
+
#endif
|
8843
9328
|
|
8844
9329
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
8845
9330
|
assert(n % QK_K == 0);
|
@@ -9037,15 +9522,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9037
9522
|
|
9038
9523
|
#elif defined(__AVX2__)
|
9039
9524
|
|
9040
|
-
const __m128i m4 = _mm_set1_epi8(0xf);
|
9041
|
-
const __m128i m1 = _mm_set1_epi8(1);
|
9042
|
-
const __m256i m511 = _mm256_set1_epi16(511);
|
9043
9525
|
const __m256i mone = _mm256_set1_epi8(1);
|
9044
|
-
|
9045
|
-
static const uint8_t k_bit_helper[32] = {
|
9046
|
-
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9047
|
-
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9048
|
-
};
|
9049
9526
|
static const char block_sign_shuffle_mask_1[32] = {
|
9050
9527
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
9051
9528
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
@@ -9059,11 +9536,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9059
9536
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9060
9537
|
};
|
9061
9538
|
|
9062
|
-
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
9063
9539
|
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
|
9064
9540
|
const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
|
9065
9541
|
const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
|
9066
9542
|
|
9543
|
+
#if QK_K == 64
|
9544
|
+
static const uint8_t k_bit_helper[16] = {
|
9545
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9546
|
+
};
|
9547
|
+
const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
|
9548
|
+
const __m128i m511 = _mm_set1_epi16(511);
|
9549
|
+
typedef union {
|
9550
|
+
__m128i vec_index;
|
9551
|
+
uint16_t index[8];
|
9552
|
+
} index_t;
|
9553
|
+
|
9554
|
+
index_t idx;
|
9555
|
+
__m256 accumf = _mm256_setzero_ps();
|
9556
|
+
for (int i = 0; i < nb; ++i) {
|
9557
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9558
|
+
const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
|
9559
|
+
idx.vec_index = _mm_and_si128(q2_data, m511);
|
9560
|
+
|
9561
|
+
const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
|
9562
|
+
const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
|
9563
|
+
const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
|
9564
|
+
|
9565
|
+
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
9566
|
+
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
|
9567
|
+
const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
|
9568
|
+
|
9569
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
9570
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
|
9571
|
+
|
9572
|
+
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
|
9573
|
+
iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
|
9574
|
+
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
|
9575
|
+
iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
|
9576
|
+
|
9577
|
+
__m256i signs;
|
9578
|
+
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
|
9579
|
+
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
9580
|
+
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
|
9581
|
+
|
9582
|
+
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
|
9583
|
+
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
9584
|
+
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
|
9585
|
+
|
9586
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
9587
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
9588
|
+
|
9589
|
+
const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
|
9590
|
+
const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
|
9591
|
+
|
9592
|
+
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
|
9593
|
+
|
9594
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
|
9595
|
+
|
9596
|
+
}
|
9597
|
+
|
9598
|
+
*s = 0.125f * hsum_float_8(accumf);
|
9599
|
+
#else
|
9600
|
+
|
9601
|
+
static const uint8_t k_bit_helper[32] = {
|
9602
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9603
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9604
|
+
};
|
9605
|
+
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
9606
|
+
const __m256i m511 = _mm256_set1_epi16(511);
|
9607
|
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
9608
|
+
const __m128i m1 = _mm_set1_epi8(1);
|
9609
|
+
|
9067
9610
|
uint64_t aux64;
|
9068
9611
|
|
9069
9612
|
// somewhat hacky, but gives a significant boost in performance
|
@@ -9111,8 +9654,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9111
9654
|
|
9112
9655
|
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
|
9113
9656
|
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
|
9114
|
-
const __m256i full_signs_1 =
|
9115
|
-
const __m256i full_signs_2 =
|
9657
|
+
const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
|
9658
|
+
const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
|
9116
9659
|
|
9117
9660
|
__m256i signs;
|
9118
9661
|
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
|
@@ -9152,6 +9695,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9152
9695
|
}
|
9153
9696
|
|
9154
9697
|
*s = 0.125f * hsum_float_8(accumf);
|
9698
|
+
#endif
|
9155
9699
|
|
9156
9700
|
#else
|
9157
9701
|
|
@@ -9193,7 +9737,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9193
9737
|
#endif
|
9194
9738
|
}
|
9195
9739
|
|
9196
|
-
void
|
9740
|
+
void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
9197
9741
|
assert(n % QK_K == 0);
|
9198
9742
|
assert(nrc == 1);
|
9199
9743
|
UNUSED(nrc);
|
@@ -9201,88 +9745,148 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9201
9745
|
UNUSED(by);
|
9202
9746
|
UNUSED(bs);
|
9203
9747
|
|
9204
|
-
const
|
9205
|
-
const block_q8_K
|
9748
|
+
const block_iq2_s * restrict x = vx;
|
9749
|
+
const block_q8_K * restrict y = vy;
|
9206
9750
|
|
9207
9751
|
const int nb = n / QK_K;
|
9208
9752
|
|
9209
9753
|
#if defined(__ARM_NEON)
|
9210
9754
|
|
9211
|
-
|
9755
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
9756
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
9757
|
+
};
|
9212
9758
|
|
9213
|
-
|
9759
|
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
9214
9760
|
|
9215
|
-
|
9761
|
+
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
|
9762
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
9763
|
+
const uint8x16_t m1 = vdupq_n_u8(1);
|
9764
|
+
const int32x4_t vzero = vdupq_n_s32(0);
|
9765
|
+
|
9766
|
+
uint8x16x2_t vs;
|
9767
|
+
ggml_int8x16x4_t q2s;
|
9216
9768
|
ggml_int8x16x4_t q8b;
|
9217
9769
|
|
9218
9770
|
float sumf = 0;
|
9219
9771
|
for (int i = 0; i < nb; ++i) {
|
9772
|
+
|
9220
9773
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9221
|
-
|
9222
|
-
const uint8_t * restrict
|
9223
|
-
const
|
9224
|
-
|
9774
|
+
|
9775
|
+
const uint8_t * restrict qs = x[i].qs;
|
9776
|
+
const uint8_t * restrict qh = x[i].qh;
|
9777
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
9778
|
+
const int8_t * restrict q8 = y[i].qs;
|
9779
|
+
|
9780
|
+
int sumi1 = 0, sumi2 = 0;
|
9225
9781
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9226
9782
|
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9227
|
-
|
9228
|
-
|
9229
|
-
const
|
9230
|
-
|
9231
|
-
const
|
9232
|
-
|
9233
|
-
|
9234
|
-
|
9235
|
-
|
9236
|
-
|
9237
|
-
|
9238
|
-
|
9239
|
-
|
9240
|
-
|
9241
|
-
|
9242
|
-
|
9243
|
-
|
9244
|
-
|
9783
|
+
q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
|
9784
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
|
9785
|
+
q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
|
9786
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
|
9787
|
+
q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
|
9788
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
|
9789
|
+
q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
|
9790
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
|
9791
|
+
qs += 8;
|
9792
|
+
|
9793
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
9794
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9795
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9796
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
9797
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9798
|
+
|
9799
|
+
q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
|
9800
|
+
q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
|
9801
|
+
|
9802
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
9803
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9804
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9805
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
9806
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9807
|
+
|
9808
|
+
signs += 4;
|
9809
|
+
|
9810
|
+
q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
|
9811
|
+
q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
|
9812
|
+
|
9813
|
+
const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
|
9814
|
+
const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
|
9815
|
+
const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
|
9816
|
+
const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
|
9817
|
+
|
9818
|
+
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
|
9819
|
+
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
|
9820
|
+
sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
|
9821
|
+
sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
|
9245
9822
|
}
|
9246
|
-
sumf += d*(
|
9823
|
+
sumf += d*(sumi1 + sumi2);
|
9247
9824
|
}
|
9248
|
-
|
9825
|
+
|
9826
|
+
*s = 0.125f * sumf;
|
9249
9827
|
|
9250
9828
|
#elif defined(__AVX2__)
|
9251
9829
|
|
9252
|
-
|
9830
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
9831
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
9832
|
+
};
|
9253
9833
|
|
9254
|
-
|
9834
|
+
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9835
|
+
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9836
|
+
};
|
9837
|
+
|
9838
|
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
9839
|
+
const __m128i m1 = _mm_set1_epi8(1);
|
9840
|
+
|
9841
|
+
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
9842
|
+
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
9843
|
+
|
9844
|
+
uint64_t aux64;
|
9255
9845
|
|
9256
9846
|
__m256 accumf = _mm256_setzero_ps();
|
9257
9847
|
for (int i = 0; i < nb; ++i) {
|
9258
9848
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9259
|
-
const uint8_t * restrict
|
9260
|
-
const uint8_t * restrict
|
9849
|
+
const uint8_t * restrict qs = x[i].qs;
|
9850
|
+
const uint8_t * restrict qh = x[i].qh;
|
9851
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
9261
9852
|
const int8_t * restrict q8 = y[i].qs;
|
9853
|
+
|
9854
|
+
memcpy(&aux64, x[i].scales, 8);
|
9855
|
+
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
|
9856
|
+
const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
|
9857
|
+
|
9262
9858
|
__m256i sumi1 = _mm256_setzero_si256();
|
9263
9859
|
__m256i sumi2 = _mm256_setzero_si256();
|
9264
9860
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9265
9861
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
9266
9862
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
9267
|
-
const __m256i q2_1 =
|
9268
|
-
|
9269
|
-
|
9270
|
-
|
9271
|
-
|
9272
|
-
|
9273
|
-
|
9274
|
-
|
9275
|
-
|
9276
|
-
|
9277
|
-
|
9278
|
-
|
9279
|
-
const __m256i
|
9280
|
-
const __m256i
|
9281
|
-
|
9282
|
-
|
9283
|
-
|
9284
|
-
const __m256i
|
9285
|
-
const __m256i
|
9863
|
+
const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
|
9864
|
+
iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
|
9865
|
+
iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
|
9866
|
+
iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
|
9867
|
+
const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
|
9868
|
+
iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
|
9869
|
+
iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
|
9870
|
+
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
9871
|
+
qs += 8;
|
9872
|
+
|
9873
|
+
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
9874
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9875
|
+
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
9876
|
+
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
9877
|
+
|
9878
|
+
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
9879
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9880
|
+
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
9881
|
+
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
9882
|
+
|
9883
|
+
signs += 4;
|
9884
|
+
|
9885
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
|
9886
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
|
9887
|
+
|
9888
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
|
9889
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
|
9286
9890
|
sumi1 = _mm256_add_epi32(sumi1, p1);
|
9287
9891
|
sumi2 = _mm256_add_epi32(sumi2, p2);
|
9288
9892
|
}
|
@@ -9291,19 +9895,163 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9291
9895
|
|
9292
9896
|
}
|
9293
9897
|
|
9294
|
-
*s = 0.
|
9898
|
+
*s = 0.125f * hsum_float_8(accumf);
|
9295
9899
|
|
9296
9900
|
#else
|
9297
9901
|
|
9298
|
-
|
9902
|
+
float sumf = 0;
|
9903
|
+
for (int i = 0; i < nb; i++) {
|
9299
9904
|
|
9300
|
-
float sumf = 0.f;
|
9301
|
-
for (int i = 0; i < nb; ++i) {
|
9302
9905
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9303
|
-
const
|
9304
|
-
const uint8_t *
|
9305
|
-
const
|
9306
|
-
|
9906
|
+
const int8_t * q8 = y[i].qs;
|
9907
|
+
const uint8_t * qs = x[i].qs;
|
9908
|
+
const uint8_t * qh = x[i].qh;
|
9909
|
+
const uint8_t * signs = qs + QK_K/8;
|
9910
|
+
|
9911
|
+
int bsum = 0;
|
9912
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
9913
|
+
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
9914
|
+
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
9915
|
+
int sumi1 = 0, sumi2 = 0;
|
9916
|
+
for (int l = 0; l < 2; ++l) {
|
9917
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
9918
|
+
for (int j = 0; j < 8; ++j) {
|
9919
|
+
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
9920
|
+
}
|
9921
|
+
q8 += 8;
|
9922
|
+
}
|
9923
|
+
for (int l = 2; l < 4; ++l) {
|
9924
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
9925
|
+
for (int j = 0; j < 8; ++j) {
|
9926
|
+
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
9927
|
+
}
|
9928
|
+
q8 += 8;
|
9929
|
+
}
|
9930
|
+
bsum += ls1 * sumi1 + ls2 * sumi2;
|
9931
|
+
qs += 4;
|
9932
|
+
signs += 4;
|
9933
|
+
}
|
9934
|
+
|
9935
|
+
sumf += d * bsum;
|
9936
|
+
}
|
9937
|
+
|
9938
|
+
*s = 0.125f * sumf;
|
9939
|
+
|
9940
|
+
#endif
|
9941
|
+
|
9942
|
+
}
|
9943
|
+
|
9944
|
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
9945
|
+
assert(n % QK_K == 0);
|
9946
|
+
assert(nrc == 1);
|
9947
|
+
UNUSED(nrc);
|
9948
|
+
UNUSED(bx);
|
9949
|
+
UNUSED(by);
|
9950
|
+
UNUSED(bs);
|
9951
|
+
|
9952
|
+
const block_iq3_xxs * restrict x = vx;
|
9953
|
+
const block_q8_K * restrict y = vy;
|
9954
|
+
|
9955
|
+
const int nb = n / QK_K;
|
9956
|
+
|
9957
|
+
#if defined(__ARM_NEON)
|
9958
|
+
|
9959
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
9960
|
+
|
9961
|
+
uint32_t aux32[2];
|
9962
|
+
|
9963
|
+
ggml_int8x16x4_t q3s;
|
9964
|
+
ggml_int8x16x4_t q8b;
|
9965
|
+
|
9966
|
+
float sumf = 0;
|
9967
|
+
for (int i = 0; i < nb; ++i) {
|
9968
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9969
|
+
const uint8_t * restrict q3 = x[i].qs;
|
9970
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
9971
|
+
const int8_t * restrict q8 = y[i].qs;
|
9972
|
+
float sumf1 = 0, sumf2 = 0;
|
9973
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9974
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9975
|
+
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
|
9976
|
+
const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
|
9977
|
+
const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
|
9978
|
+
const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
|
9979
|
+
const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
|
9980
|
+
q3 += 16;
|
9981
|
+
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
|
9982
|
+
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
|
9983
|
+
q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
|
9984
|
+
q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
|
9985
|
+
q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
|
9986
|
+
q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
|
9987
|
+
q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
|
9988
|
+
q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
|
9989
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
9990
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
9991
|
+
sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
|
9992
|
+
sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
|
9993
|
+
}
|
9994
|
+
sumf += d*(sumf1 + sumf2);
|
9995
|
+
}
|
9996
|
+
*s = 0.5f * sumf;
|
9997
|
+
|
9998
|
+
#elif defined(__AVX2__)
|
9999
|
+
|
10000
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
10001
|
+
|
10002
|
+
uint32_t aux32[2];
|
10003
|
+
|
10004
|
+
__m256 accumf = _mm256_setzero_ps();
|
10005
|
+
for (int i = 0; i < nb; ++i) {
|
10006
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10007
|
+
const uint8_t * restrict q3 = x[i].qs;
|
10008
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
10009
|
+
const int8_t * restrict q8 = y[i].qs;
|
10010
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
10011
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
10012
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10013
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10014
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10015
|
+
const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
10016
|
+
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
10017
|
+
q3 += 8;
|
10018
|
+
const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
10019
|
+
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
10020
|
+
q3 += 8;
|
10021
|
+
memcpy(aux32, gas, 8); gas += 8;
|
10022
|
+
const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
|
10023
|
+
signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
|
10024
|
+
const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
|
10025
|
+
signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
10026
|
+
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
|
10027
|
+
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
|
10028
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
10029
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
10030
|
+
const uint16_t ls1 = aux32[0] >> 28;
|
10031
|
+
const uint16_t ls2 = aux32[1] >> 28;
|
10032
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
|
10033
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
|
10034
|
+
sumi1 = _mm256_add_epi32(sumi1, p1);
|
10035
|
+
sumi2 = _mm256_add_epi32(sumi2, p2);
|
10036
|
+
}
|
10037
|
+
|
10038
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
10039
|
+
|
10040
|
+
}
|
10041
|
+
|
10042
|
+
*s = 0.25f * hsum_float_8(accumf);
|
10043
|
+
|
10044
|
+
#else
|
10045
|
+
|
10046
|
+
uint32_t aux32;
|
10047
|
+
|
10048
|
+
float sumf = 0.f;
|
10049
|
+
for (int i = 0; i < nb; ++i) {
|
10050
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10051
|
+
const uint8_t * restrict q3 = x[i].qs;
|
10052
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
10053
|
+
const int8_t * restrict q8 = y[i].qs;
|
10054
|
+
int32_t bsum = 0;
|
9307
10055
|
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
9308
10056
|
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
9309
10057
|
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
@@ -9327,6 +10075,245 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9327
10075
|
#endif
|
9328
10076
|
}
|
9329
10077
|
|
10078
|
+
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
10079
|
+
assert(n % QK_K == 0);
|
10080
|
+
assert(nrc == 1);
|
10081
|
+
UNUSED(nrc);
|
10082
|
+
UNUSED(bx);
|
10083
|
+
UNUSED(by);
|
10084
|
+
UNUSED(bs);
|
10085
|
+
|
10086
|
+
const block_iq3_s * restrict x = vx;
|
10087
|
+
const block_q8_K * restrict y = vy;
|
10088
|
+
|
10089
|
+
const int nb = n / QK_K;
|
10090
|
+
|
10091
|
+
#if defined(__ARM_NEON)
|
10092
|
+
|
10093
|
+
typedef union {
|
10094
|
+
uint16x8_t vec_index;
|
10095
|
+
uint16_t index[8];
|
10096
|
+
} vec_index_t;
|
10097
|
+
|
10098
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
10099
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
10100
|
+
};
|
10101
|
+
|
10102
|
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
10103
|
+
|
10104
|
+
static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
|
10105
|
+
|
10106
|
+
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
|
10107
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
10108
|
+
const int16x8_t hshift = vld1q_s16(k_shift);
|
10109
|
+
const uint16x8_t m256 = vdupq_n_u16(256);
|
10110
|
+
const uint8x16_t m1 = vdupq_n_u8(1);
|
10111
|
+
|
10112
|
+
uint8x16x2_t vs;
|
10113
|
+
ggml_int8x16x4_t q3s;
|
10114
|
+
ggml_int8x16x4_t q8b;
|
10115
|
+
vec_index_t idx;
|
10116
|
+
|
10117
|
+
#if QK_K == 256
|
10118
|
+
uint32_t scales32[2];
|
10119
|
+
const uint8_t * scales8 = (const uint8_t *)scales32;
|
10120
|
+
#endif
|
10121
|
+
|
10122
|
+
float sumf = 0;
|
10123
|
+
for (int i = 0; i < nb; ++i) {
|
10124
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10125
|
+
const uint8_t * restrict qs = x[i].qs;
|
10126
|
+
const uint8_t * restrict qh = x[i].qh;
|
10127
|
+
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
10128
|
+
const int8_t * restrict q8 = y[i].qs;
|
10129
|
+
|
10130
|
+
#if QK_K == 256
|
10131
|
+
memcpy(scales32, x[i].scales, 4);
|
10132
|
+
scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
|
10133
|
+
scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
|
10134
|
+
#endif
|
10135
|
+
|
10136
|
+
int sumi1 = 0, sumi2 = 0;
|
10137
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10138
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
10139
|
+
|
10140
|
+
const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
|
10141
|
+
idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
|
10142
|
+
const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
|
10143
|
+
iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
|
10144
|
+
const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
|
10145
|
+
iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
|
10146
|
+
idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
|
10147
|
+
const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
|
10148
|
+
iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
|
10149
|
+
const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
|
10150
|
+
iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
|
10151
|
+
|
10152
|
+
|
10153
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
10154
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
10155
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
10156
|
+
vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
|
10157
|
+
vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
|
10158
|
+
|
10159
|
+
q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
|
10160
|
+
q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
|
10161
|
+
|
10162
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
10163
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
10164
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
10165
|
+
vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
|
10166
|
+
vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
|
10167
|
+
|
10168
|
+
signs += 4;
|
10169
|
+
|
10170
|
+
q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
|
10171
|
+
q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
|
10172
|
+
|
10173
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
10174
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
10175
|
+
#if QK_K == 256
|
10176
|
+
sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
|
10177
|
+
sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
|
10178
|
+
#else
|
10179
|
+
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
10180
|
+
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
10181
|
+
#endif
|
10182
|
+
}
|
10183
|
+
sumf += d*(sumi1 + sumi2);
|
10184
|
+
}
|
10185
|
+
*s = sumf;
|
10186
|
+
|
10187
|
+
#elif defined(__AVX2__)
|
10188
|
+
|
10189
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
10190
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
10191
|
+
};
|
10192
|
+
|
10193
|
+
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
10194
|
+
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
10195
|
+
};
|
10196
|
+
|
10197
|
+
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
10198
|
+
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
10199
|
+
|
10200
|
+
const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
|
10201
|
+
const __m256i idx_mask = _mm256_set1_epi32(256);
|
10202
|
+
|
10203
|
+
typedef union {
|
10204
|
+
__m256i vec[2];
|
10205
|
+
uint32_t index[16];
|
10206
|
+
} index_t;
|
10207
|
+
|
10208
|
+
index_t idx;
|
10209
|
+
|
10210
|
+
__m256 accumf = _mm256_setzero_ps();
|
10211
|
+
for (int i = 0; i < nb; ++i) {
|
10212
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10213
|
+
const uint8_t * restrict qs = x[i].qs;
|
10214
|
+
const uint8_t * restrict qh = x[i].qh;
|
10215
|
+
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
10216
|
+
const int8_t * restrict q8 = y[i].qs;
|
10217
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
10218
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
10219
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10220
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10221
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10222
|
+
const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
|
10223
|
+
idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
|
10224
|
+
idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
|
10225
|
+
idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
|
10226
|
+
idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
|
10227
|
+
idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
|
10228
|
+
idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
|
10229
|
+
|
10230
|
+
// At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
|
10231
|
+
//const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
|
10232
|
+
//const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
|
10233
|
+
const __m256i q2_1 = _mm256_set_epi32(
|
10234
|
+
iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
|
10235
|
+
iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
|
10236
|
+
);
|
10237
|
+
const __m256i q2_2 = _mm256_set_epi32(
|
10238
|
+
iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
|
10239
|
+
iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
|
10240
|
+
);
|
10241
|
+
|
10242
|
+
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
10243
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
10244
|
+
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
10245
|
+
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
10246
|
+
|
10247
|
+
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
10248
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
10249
|
+
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
10250
|
+
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
10251
|
+
|
10252
|
+
signs += 4;
|
10253
|
+
|
10254
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
10255
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
10256
|
+
const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
|
10257
|
+
const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
|
10258
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
|
10259
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
|
10260
|
+
sumi1 = _mm256_add_epi32(sumi1, p1);
|
10261
|
+
sumi2 = _mm256_add_epi32(sumi2, p2);
|
10262
|
+
}
|
10263
|
+
|
10264
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
10265
|
+
|
10266
|
+
}
|
10267
|
+
|
10268
|
+
*s = hsum_float_8(accumf);
|
10269
|
+
|
10270
|
+
#else
|
10271
|
+
|
10272
|
+
float sumf = 0.f;
|
10273
|
+
for (int i = 0; i < nb; ++i) {
|
10274
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10275
|
+
const uint8_t * restrict qs = x[i].qs;
|
10276
|
+
const uint8_t * restrict qh = x[i].qh;
|
10277
|
+
const uint8_t * restrict signs = x[i].signs;
|
10278
|
+
const int8_t * restrict q8 = y[i].qs;
|
10279
|
+
int32_t bsum = 0;
|
10280
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10281
|
+
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
10282
|
+
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
10283
|
+
int32_t sumi = 0;
|
10284
|
+
for (int l = 0; l < 4; ++l) {
|
10285
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
10286
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
10287
|
+
for (int j = 0; j < 4; ++j) {
|
10288
|
+
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
10289
|
+
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
10290
|
+
}
|
10291
|
+
q8 += 8;
|
10292
|
+
}
|
10293
|
+
qs += 8;
|
10294
|
+
signs += 4;
|
10295
|
+
bsum += sumi * ls1;
|
10296
|
+
sumi = 0;
|
10297
|
+
for (int l = 0; l < 4; ++l) {
|
10298
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
10299
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
10300
|
+
for (int j = 0; j < 4; ++j) {
|
10301
|
+
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
10302
|
+
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
10303
|
+
}
|
10304
|
+
q8 += 8;
|
10305
|
+
}
|
10306
|
+
qs += 8;
|
10307
|
+
signs += 4;
|
10308
|
+
bsum += sumi * ls2;
|
10309
|
+
}
|
10310
|
+
sumf += d * bsum;
|
10311
|
+
}
|
10312
|
+
*s = sumf;
|
10313
|
+
#endif
|
10314
|
+
}
|
10315
|
+
|
10316
|
+
|
9330
10317
|
#ifdef __AVX2__
|
9331
10318
|
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
9332
10319
|
const __m256i ax = _mm256_sign_epi8(x, x);
|
@@ -9348,7 +10335,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9348
10335
|
|
9349
10336
|
const int nb = n / QK_K;
|
9350
10337
|
|
9351
|
-
|
10338
|
+
// TODO: implement for QK_K = 64
|
10339
|
+
#if defined __ARM_NEON && QK_K == 256
|
9352
10340
|
|
9353
10341
|
const uint8x16_t m8 = vdupq_n_u8(0x08);
|
9354
10342
|
const uint8x16_t m7 = vdupq_n_u8(0x07);
|
@@ -9405,7 +10393,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9405
10393
|
|
9406
10394
|
*s = sumf;
|
9407
10395
|
|
9408
|
-
|
10396
|
+
// TODO: implement for QK_K = 64
|
10397
|
+
#elif defined __AVX2__ && QK_K == 256
|
9409
10398
|
|
9410
10399
|
const __m128i m8 = _mm_set1_epi8(0x08);
|
9411
10400
|
const __m128i m7 = _mm_set1_epi8(0x07);
|
@@ -9420,8 +10409,12 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9420
10409
|
|
9421
10410
|
uint64_t aux64;
|
9422
10411
|
|
9423
|
-
|
9424
|
-
|
10412
|
+
typedef union m256i_uint16 {
|
10413
|
+
__m256i reg;
|
10414
|
+
uint16_t s[16];
|
10415
|
+
} m256i_uint16_t;
|
10416
|
+
|
10417
|
+
m256i_uint16_t v_gindex;
|
9425
10418
|
|
9426
10419
|
__m256 accum = _mm256_setzero_ps();
|
9427
10420
|
for (int i = 0; i < nb; ++i) {
|
@@ -9436,13 +10429,13 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9436
10429
|
memcpy(&aux64, sc, 8); sc += 8;
|
9437
10430
|
const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
|
9438
10431
|
const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
|
9439
|
-
v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
10432
|
+
v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
9440
10433
|
const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
|
9441
10434
|
|
9442
10435
|
for (int i32 = 0; i32 < 4; ++i32) {
|
9443
10436
|
const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
9444
|
-
const __m256i q1b = _mm256_set_epi64x(iq1s_grid[
|
9445
|
-
iq1s_grid[
|
10437
|
+
const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
|
10438
|
+
iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
|
9446
10439
|
const __m256i dot = mul_add_epi8(q1b, q8b);
|
9447
10440
|
const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
|
9448
10441
|
const __m256i p = _mm256_madd_epi16(s16, dot);
|
@@ -9520,27 +10513,134 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
9520
10513
|
int8x16x4_t q8b;
|
9521
10514
|
int32x4_t prod_1, prod_2;
|
9522
10515
|
|
9523
|
-
float sumf = 0;
|
10516
|
+
float sumf = 0;
|
10517
|
+
|
10518
|
+
for (int ib = 0; ib < nb; ib += 2) {
|
10519
|
+
|
10520
|
+
q4bits.val[0] = vld1q_u8(x[ib+0].qs);
|
10521
|
+
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
|
10522
|
+
q8b.val[0] = vld1q_s8(y[ib+0].qs);
|
10523
|
+
q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
|
10524
|
+
q8b.val[2] = vld1q_s8(y[ib+1].qs);
|
10525
|
+
q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);
|
10526
|
+
|
10527
|
+
q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
|
10528
|
+
q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
|
10529
|
+
q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
|
10530
|
+
q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
|
10531
|
+
|
10532
|
+
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
10533
|
+
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
10534
|
+
|
10535
|
+
sumf +=
|
10536
|
+
GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
|
10537
|
+
GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
|
10538
|
+
}
|
10539
|
+
|
10540
|
+
*s = sumf;
|
10541
|
+
|
10542
|
+
#elif defined __AVX2__
|
10543
|
+
|
10544
|
+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
10545
|
+
const __m128i m4b = _mm_set1_epi8(0x0f);
|
10546
|
+
const __m256i mone = _mm256_set1_epi16(1);
|
10547
|
+
|
10548
|
+
__m256 accum1 = _mm256_setzero_ps();
|
10549
|
+
__m256 accum2 = _mm256_setzero_ps();
|
10550
|
+
for (int ib = 0; ib < nb; ib += 2) {
|
10551
|
+
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
|
10552
|
+
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
|
10553
|
+
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
|
10554
|
+
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
|
10555
|
+
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
10556
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
10557
|
+
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
10558
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
10559
|
+
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
10560
|
+
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
10561
|
+
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
|
10562
|
+
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
|
10563
|
+
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
|
10564
|
+
_mm256_cvtepi32_ps(p_1), accum1);
|
10565
|
+
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
|
10566
|
+
_mm256_cvtepi32_ps(p_2), accum2);
|
10567
|
+
|
10568
|
+
y += 2;
|
10569
|
+
x += 2;
|
10570
|
+
}
|
10571
|
+
|
10572
|
+
*s = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
10573
|
+
|
10574
|
+
#else
|
10575
|
+
float sumf = 0;
|
10576
|
+
for (int ib = 0; ib < nb; ++ib) {
|
10577
|
+
const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
|
10578
|
+
int sumi1 = 0, sumi2 = 0;
|
10579
|
+
for (int j = 0; j < QK4_NL/2; ++j) {
|
10580
|
+
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
10581
|
+
sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
|
10582
|
+
}
|
10583
|
+
sumf += d * (sumi1 + sumi2);
|
10584
|
+
}
|
10585
|
+
*s = sumf;
|
10586
|
+
#endif
|
10587
|
+
}
|
10588
|
+
|
10589
|
+
void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
10590
|
+
assert(nrc == 1);
|
10591
|
+
UNUSED(nrc);
|
10592
|
+
UNUSED(bx);
|
10593
|
+
UNUSED(by);
|
10594
|
+
UNUSED(bs);
|
10595
|
+
assert(n % QK_K == 0);
|
10596
|
+
#if QK_K == 64
|
10597
|
+
ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
|
10598
|
+
#else
|
10599
|
+
|
10600
|
+
const block_iq4_xs * restrict x = vx;
|
10601
|
+
const block_q8_K * restrict y = vy;
|
10602
|
+
|
10603
|
+
const int nb = n / QK_K;
|
10604
|
+
|
10605
|
+
#if defined __ARM_NEON
|
10606
|
+
const int8x16_t values = vld1q_s8(kvalues_iq4nl);
|
10607
|
+
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
10608
|
+
ggml_uint8x16x2_t q4bits;
|
10609
|
+
ggml_int8x16x4_t q4b;
|
10610
|
+
ggml_int8x16x4_t q8b;
|
10611
|
+
int32x4_t prod_1, prod_2;
|
10612
|
+
|
10613
|
+
float sumf = 0;
|
10614
|
+
|
10615
|
+
for (int ibl = 0; ibl < nb; ++ibl) {
|
10616
|
+
|
10617
|
+
const int8_t * q8 = y[ibl].qs;
|
10618
|
+
const uint8_t * q4 = x[ibl].qs;
|
10619
|
+
uint16_t h = x[ibl].scales_h;
|
10620
|
+
|
10621
|
+
int sumi1 = 0, sumi2 = 0;
|
10622
|
+
for (int ib = 0; ib < QK_K/64; ++ib) {
|
10623
|
+
|
10624
|
+
q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
|
10625
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9524
10626
|
|
9525
|
-
|
9526
|
-
|
9527
|
-
|
9528
|
-
|
9529
|
-
q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
|
9530
|
-
q8b.val[2] = vld1q_s8(y[ib+1].qs);
|
9531
|
-
q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);
|
10627
|
+
q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
|
10628
|
+
q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
|
10629
|
+
q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
|
10630
|
+
q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
|
9532
10631
|
|
9533
|
-
|
9534
|
-
|
9535
|
-
q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
|
9536
|
-
q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
|
10632
|
+
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
10633
|
+
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
9537
10634
|
|
9538
|
-
|
9539
|
-
|
10635
|
+
int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
|
10636
|
+
int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
|
10637
|
+
h >>= 4;
|
10638
|
+
sumi1 += vaddvq_s32(prod_1) * ls1;
|
10639
|
+
sumi2 += vaddvq_s32(prod_2) * ls2;
|
9540
10640
|
|
9541
|
-
|
9542
|
-
|
9543
|
-
|
10641
|
+
}
|
10642
|
+
|
10643
|
+
sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
|
9544
10644
|
}
|
9545
10645
|
|
9546
10646
|
*s = sumf;
|
@@ -9549,47 +10649,73 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
9549
10649
|
|
9550
10650
|
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
9551
10651
|
const __m128i m4b = _mm_set1_epi8(0x0f);
|
9552
|
-
const __m256i mone = _mm256_set1_epi16(1);
|
9553
|
-
|
9554
|
-
__m256 accum1 = _mm256_setzero_ps();
|
9555
|
-
__m256 accum2 = _mm256_setzero_ps();
|
9556
|
-
for (int ib = 0; ib < nb; ib += 2) {
|
9557
|
-
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
|
9558
|
-
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
|
9559
|
-
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
|
9560
|
-
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
|
9561
|
-
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
9562
|
-
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
9563
|
-
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
9564
|
-
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
9565
|
-
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
9566
|
-
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
9567
|
-
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
|
9568
|
-
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
|
9569
|
-
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
|
9570
|
-
_mm256_cvtepi32_ps(p_1), accum1);
|
9571
|
-
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
|
9572
|
-
_mm256_cvtepi32_ps(p_2), accum2);
|
9573
10652
|
|
9574
|
-
|
9575
|
-
|
10653
|
+
__m256 accum = _mm256_setzero_ps();
|
10654
|
+
for (int ibl = 0; ibl < nb; ++ibl) {
|
10655
|
+
const uint8_t * qs = x[ibl].qs;
|
10656
|
+
const int8_t * q8 = y[ibl].qs;
|
10657
|
+
uint16_t sh = x[ibl].scales_h;
|
10658
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
10659
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
10660
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
10661
|
+
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
|
10662
|
+
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
|
10663
|
+
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10664
|
+
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10665
|
+
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
10666
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
10667
|
+
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
10668
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
10669
|
+
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
10670
|
+
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
10671
|
+
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
10672
|
+
const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
|
10673
|
+
sh >>= 4;
|
10674
|
+
const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
|
10675
|
+
const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
|
10676
|
+
sumi1 = _mm256_add_epi32(p_1, sumi1);
|
10677
|
+
sumi2 = _mm256_add_epi32(p_2, sumi2);
|
10678
|
+
}
|
10679
|
+
accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
|
10680
|
+
_mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
|
9576
10681
|
}
|
9577
10682
|
|
9578
|
-
*s = hsum_float_8(
|
10683
|
+
*s = hsum_float_8(accum);
|
9579
10684
|
|
9580
10685
|
#else
|
9581
10686
|
float sumf = 0;
|
9582
|
-
for (int
|
9583
|
-
const float
|
9584
|
-
|
9585
|
-
|
9586
|
-
|
9587
|
-
|
10687
|
+
for (int ibl = 0; ibl < nb; ++ibl) {
|
10688
|
+
const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
10689
|
+
uint16_t h = x[ibl].scales_h;
|
10690
|
+
const uint8_t * qs = x[ibl].qs;
|
10691
|
+
const int8_t * q8 = y[ibl].qs;
|
10692
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
10693
|
+
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
10694
|
+
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
10695
|
+
h >>= 4;
|
10696
|
+
const float d1 = d4d8*(ls1 - 32);
|
10697
|
+
const float d2 = d4d8*(ls2 - 32);
|
10698
|
+
int sumi1 = 0, sumi2 = 0;
|
10699
|
+
for (int j = 0; j < 16; ++j) {
|
10700
|
+
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
10701
|
+
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
10702
|
+
}
|
10703
|
+
sumf += d1 * (sumi1 + sumi2);
|
10704
|
+
qs += 16;
|
10705
|
+
q8 += 32;
|
10706
|
+
sumi1 = sumi2 = 0;
|
10707
|
+
for (int j = 0; j < 16; ++j) {
|
10708
|
+
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
10709
|
+
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
10710
|
+
}
|
10711
|
+
sumf += d2 * (sumi1 + sumi2);
|
10712
|
+
qs += 16;
|
10713
|
+
q8 += 32;
|
9588
10714
|
}
|
9589
|
-
sumf += d * (sumi1 + sumi2);
|
9590
10715
|
}
|
9591
10716
|
*s = sumf;
|
9592
10717
|
#endif
|
10718
|
+
#endif
|
9593
10719
|
}
|
9594
10720
|
|
9595
10721
|
// ================================ IQ2 quantization =============================================
|
@@ -9600,22 +10726,25 @@ typedef struct {
|
|
9600
10726
|
uint16_t * neighbours;
|
9601
10727
|
} iq2_entry_t;
|
9602
10728
|
|
9603
|
-
static iq2_entry_t iq2_data[
|
10729
|
+
static iq2_entry_t iq2_data[4] = {
|
10730
|
+
{NULL, NULL, NULL},
|
9604
10731
|
{NULL, NULL, NULL},
|
9605
10732
|
{NULL, NULL, NULL},
|
9606
10733
|
{NULL, NULL, NULL},
|
9607
10734
|
};
|
9608
10735
|
|
9609
10736
|
static inline int iq2_data_index(enum ggml_type type) {
|
9610
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
10737
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
9611
10738
|
return type == GGML_TYPE_IQ2_XXS ? 0 :
|
9612
|
-
type == GGML_TYPE_IQ2_XS ? 1 :
|
10739
|
+
type == GGML_TYPE_IQ2_XS ? 1 :
|
10740
|
+
type == GGML_TYPE_IQ1_S ? 2 : 3;
|
9613
10741
|
}
|
9614
10742
|
|
9615
10743
|
static inline int iq2_grid_size(enum ggml_type type) {
|
9616
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
10744
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
9617
10745
|
return type == GGML_TYPE_IQ2_XXS ? 256 :
|
9618
|
-
type == GGML_TYPE_IQ2_XS ? 512 :
|
10746
|
+
type == GGML_TYPE_IQ2_XS ? 512 :
|
10747
|
+
type == GGML_TYPE_IQ1_S ? 512 : 1024;
|
9619
10748
|
}
|
9620
10749
|
|
9621
10750
|
static int iq2_compare_func(const void * left, const void * right) {
|
@@ -9716,11 +10845,79 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
9716
10845
|
41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
|
9717
10846
|
42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
|
9718
10847
|
};
|
10848
|
+
static const uint16_t kgrid_2bit_1024[1024] = {
|
10849
|
+
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
10850
|
+
73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
|
10851
|
+
165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
|
10852
|
+
337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
|
10853
|
+
517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
|
10854
|
+
674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
|
10855
|
+
1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
|
10856
|
+
1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
|
10857
|
+
1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
|
10858
|
+
1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
|
10859
|
+
2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
|
10860
|
+
2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
|
10861
|
+
2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
|
10862
|
+
4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
|
10863
|
+
4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
|
10864
|
+
4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
|
10865
|
+
4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
|
10866
|
+
4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
|
10867
|
+
5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
|
10868
|
+
5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
|
10869
|
+
5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
|
10870
|
+
5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
|
10871
|
+
6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
|
10872
|
+
6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
|
10873
|
+
8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
|
10874
|
+
8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
|
10875
|
+
8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
|
10876
|
+
9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
|
10877
|
+
9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
|
10878
|
+
10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
|
10879
|
+
16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
|
10880
|
+
16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
|
10881
|
+
16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
|
10882
|
+
16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
|
10883
|
+
17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
|
10884
|
+
17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
|
10885
|
+
17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
|
10886
|
+
17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
|
10887
|
+
18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
|
10888
|
+
18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
|
10889
|
+
18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
|
10890
|
+
20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
|
10891
|
+
20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
|
10892
|
+
20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
|
10893
|
+
21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
|
10894
|
+
21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
|
10895
|
+
22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
|
10896
|
+
22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
|
10897
|
+
24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
|
10898
|
+
24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
|
10899
|
+
25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
|
10900
|
+
26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
|
10901
|
+
32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
|
10902
|
+
33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
|
10903
|
+
33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
|
10904
|
+
33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
|
10905
|
+
34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
|
10906
|
+
35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
|
10907
|
+
36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
|
10908
|
+
37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
|
10909
|
+
38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
|
10910
|
+
39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
|
10911
|
+
41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
|
10912
|
+
42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
|
10913
|
+
};
|
9719
10914
|
|
9720
10915
|
const int kmap_size = 43692;
|
9721
|
-
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
10916
|
+
//const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
10917
|
+
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
|
9722
10918
|
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
|
9723
|
-
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
10919
|
+
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
10920
|
+
type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
|
9724
10921
|
uint64_t * kgrid_q2xs;
|
9725
10922
|
int * kmap_q2xs;
|
9726
10923
|
uint16_t * kneighbors_q2xs;
|
@@ -9817,7 +11014,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
9817
11014
|
}
|
9818
11015
|
|
9819
11016
|
void iq2xs_free_impl(enum ggml_type type) {
|
9820
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
11017
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
9821
11018
|
const int gindex = iq2_data_index(type);
|
9822
11019
|
if (iq2_data[gindex].grid) {
|
9823
11020
|
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
@@ -9866,7 +11063,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
9866
11063
|
|
9867
11064
|
const int kMaxQ = 3;
|
9868
11065
|
|
9869
|
-
const int nbl = n/
|
11066
|
+
const int nbl = n/QK_K;
|
9870
11067
|
|
9871
11068
|
block_iq2_xxs * y = vy;
|
9872
11069
|
|
@@ -10039,7 +11236,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
10039
11236
|
|
10040
11237
|
const int kMaxQ = 3;
|
10041
11238
|
|
10042
|
-
const int nbl = n/
|
11239
|
+
const int nbl = n/QK_K;
|
10043
11240
|
|
10044
11241
|
block_iq2_xs * y = vy;
|
10045
11242
|
|
@@ -10239,14 +11436,15 @@ typedef struct {
|
|
10239
11436
|
uint16_t * neighbours;
|
10240
11437
|
} iq3_entry_t;
|
10241
11438
|
|
10242
|
-
static iq3_entry_t iq3_data[
|
11439
|
+
static iq3_entry_t iq3_data[2] = {
|
11440
|
+
{NULL, NULL, NULL},
|
10243
11441
|
{NULL, NULL, NULL},
|
10244
11442
|
};
|
10245
11443
|
|
10246
11444
|
static inline int iq3_data_index(int grid_size) {
|
10247
11445
|
(void)grid_size;
|
10248
|
-
GGML_ASSERT(grid_size == 256);
|
10249
|
-
return 0;
|
11446
|
+
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
11447
|
+
return grid_size == 256 ? 0 : 1;
|
10250
11448
|
}
|
10251
11449
|
|
10252
11450
|
static int iq3_compare_func(const void * left, const void * right) {
|
@@ -10278,9 +11476,44 @@ void iq3xs_init_impl(int grid_size) {
|
|
10278
11476
|
3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
|
10279
11477
|
3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
|
10280
11478
|
};
|
11479
|
+
static const uint16_t kgrid_512[512] = {
|
11480
|
+
0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34,
|
11481
|
+
37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77,
|
11482
|
+
80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142,
|
11483
|
+
145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210,
|
11484
|
+
217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288,
|
11485
|
+
291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393,
|
11486
|
+
395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514,
|
11487
|
+
516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576,
|
11488
|
+
577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653,
|
11489
|
+
655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727,
|
11490
|
+
728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833,
|
11491
|
+
840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977,
|
11492
|
+
989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047,
|
11493
|
+
1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103,
|
11494
|
+
1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199,
|
11495
|
+
1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296,
|
11496
|
+
1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415,
|
11497
|
+
1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561,
|
11498
|
+
1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648,
|
11499
|
+
1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761,
|
11500
|
+
1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877,
|
11501
|
+
1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068,
|
11502
|
+
2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177,
|
11503
|
+
2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269,
|
11504
|
+
2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520,
|
11505
|
+
2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634,
|
11506
|
+
2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805,
|
11507
|
+
2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083,
|
11508
|
+
3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276,
|
11509
|
+
3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591,
|
11510
|
+
3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729,
|
11511
|
+
3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032,
|
11512
|
+
};
|
11513
|
+
|
10281
11514
|
const int kmap_size = 4096;
|
10282
|
-
const int nwant = 2;
|
10283
|
-
const uint16_t * kgrid = kgrid_256;
|
11515
|
+
const int nwant = grid_size == 256 ? 2 : 3;
|
11516
|
+
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
|
10284
11517
|
uint32_t * kgrid_q3xs;
|
10285
11518
|
int * kmap_q3xs;
|
10286
11519
|
uint16_t * kneighbors_q3xs;
|
@@ -10377,7 +11610,7 @@ void iq3xs_init_impl(int grid_size) {
|
|
10377
11610
|
}
|
10378
11611
|
|
10379
11612
|
void iq3xs_free_impl(int grid_size) {
|
10380
|
-
GGML_ASSERT(grid_size == 256);
|
11613
|
+
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
10381
11614
|
const int gindex = iq3_data_index(grid_size);
|
10382
11615
|
if (iq3_data[gindex].grid) {
|
10383
11616
|
free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
|
@@ -10410,9 +11643,10 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
10410
11643
|
return grid_index;
|
10411
11644
|
}
|
10412
11645
|
|
10413
|
-
static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n,
|
11646
|
+
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
|
11647
|
+
const float * restrict quant_weights) {
|
10414
11648
|
|
10415
|
-
const int gindex = iq3_data_index(
|
11649
|
+
const int gindex = iq3_data_index(grid_size);
|
10416
11650
|
|
10417
11651
|
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
10418
11652
|
const int * kmap_q3xs = iq3_data[gindex].map;
|
@@ -10426,9 +11660,23 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10426
11660
|
|
10427
11661
|
const int kMaxQ = 8;
|
10428
11662
|
|
10429
|
-
const int nbl = n/
|
11663
|
+
const int nbl = n/QK_K;
|
10430
11664
|
|
10431
|
-
|
11665
|
+
ggml_fp16_t * dh;
|
11666
|
+
uint8_t * qs;
|
11667
|
+
int block_size;
|
11668
|
+
if (grid_size == 256) {
|
11669
|
+
block_iq3_xxs * y = vy;
|
11670
|
+
dh = &y->d;
|
11671
|
+
qs = y->qs;
|
11672
|
+
block_size = sizeof(block_iq3_xxs);
|
11673
|
+
} else {
|
11674
|
+
block_iq3_s * y = vy;
|
11675
|
+
dh = &y->d;
|
11676
|
+
qs = y->qs;
|
11677
|
+
block_size = sizeof(block_iq3_s);
|
11678
|
+
}
|
11679
|
+
int quant_size = block_size - sizeof(ggml_fp16_t);
|
10432
11680
|
|
10433
11681
|
float scales[QK_K/32];
|
10434
11682
|
float weight[32];
|
@@ -10439,65 +11687,280 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10439
11687
|
bool is_on_grid[8];
|
10440
11688
|
bool is_on_grid_aux[8];
|
10441
11689
|
uint8_t block_signs[8];
|
10442
|
-
uint8_t q3[3*(QK_K/8)];
|
11690
|
+
uint8_t q3[3*(QK_K/8)+QK_K/32];
|
10443
11691
|
uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
|
11692
|
+
uint8_t * qh = q3 + 3*(QK_K/8);
|
10444
11693
|
|
10445
11694
|
for (int ibl = 0; ibl < nbl; ++ibl) {
|
10446
11695
|
|
10447
|
-
|
10448
|
-
memset(q3, 0, 3*QK_K/8);
|
11696
|
+
dh[0] = GGML_FP32_TO_FP16(0.f);
|
11697
|
+
memset(q3, 0, 3*QK_K/8+QK_K/32);
|
10449
11698
|
|
10450
11699
|
float max_scale = 0;
|
10451
11700
|
|
10452
11701
|
const float * xbl = x + QK_K*ibl;
|
10453
11702
|
float sumx2 = 0;
|
10454
11703
|
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
10455
|
-
float sigma2 = sumx2/QK_K;
|
11704
|
+
float sigma2 = 2*sumx2/QK_K;
|
10456
11705
|
|
10457
11706
|
for (int ib = 0; ib < QK_K/32; ++ib) {
|
10458
11707
|
const float * xb = xbl + 32*ib;
|
10459
11708
|
if (quant_weights) {
|
10460
|
-
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
10461
|
-
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
11709
|
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
11710
|
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
11711
|
+
} else {
|
11712
|
+
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
11713
|
+
}
|
11714
|
+
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
11715
|
+
for (int k = 0; k < 4; ++k) {
|
11716
|
+
int nflip = 0;
|
11717
|
+
uint8_t s = 0;
|
11718
|
+
for (int i = 0; i < 8; ++i) {
|
11719
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
11720
|
+
else {
|
11721
|
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
11722
|
+
}
|
11723
|
+
}
|
11724
|
+
if (nflip%2) {
|
11725
|
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
11726
|
+
for (int i = 1; i < 8; ++i) {
|
11727
|
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
11728
|
+
if (ax < min) {
|
11729
|
+
min = ax; imin = i;
|
11730
|
+
}
|
11731
|
+
}
|
11732
|
+
xval[8*k+imin] = -xval[8*k+imin];
|
11733
|
+
s ^= (1 << imin);
|
11734
|
+
}
|
11735
|
+
block_signs[k] = s & 127;
|
11736
|
+
}
|
11737
|
+
float max = xval[0];
|
11738
|
+
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
11739
|
+
if (!max) {
|
11740
|
+
scales[ib] = 0;
|
11741
|
+
memset(L, 0, 32);
|
11742
|
+
continue;
|
11743
|
+
}
|
11744
|
+
float best = 0;
|
11745
|
+
float scale = max/(2*kMaxQ-1);
|
11746
|
+
for (int is = -15; is <= 15; ++is) {
|
11747
|
+
float id = (2*kMaxQ-1+is*0.2f)/max;
|
11748
|
+
float this_scale = 1/id;
|
11749
|
+
for (int k = 0; k < 8; ++k) {
|
11750
|
+
for (int i = 0; i < 4; ++i) {
|
11751
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
11752
|
+
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
11753
|
+
}
|
11754
|
+
uint16_t u = 0;
|
11755
|
+
for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
|
11756
|
+
int grid_index = kmap_q3xs[u];
|
11757
|
+
is_on_grid_aux[k] = true;
|
11758
|
+
if (grid_index < 0) {
|
11759
|
+
is_on_grid_aux[k] = false;
|
11760
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
11761
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
|
11762
|
+
}
|
11763
|
+
}
|
11764
|
+
float sumqx = 0, sumq2 = 0;
|
11765
|
+
for (int i = 0; i < 32; ++i) {
|
11766
|
+
float w = weight[i];
|
11767
|
+
float q = 2*Laux[i] + 1;
|
11768
|
+
sumqx += w*xval[i]*q;
|
11769
|
+
sumq2 += w*q*q;
|
11770
|
+
}
|
11771
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
11772
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
11773
|
+
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
|
11774
|
+
for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
11775
|
+
}
|
11776
|
+
}
|
11777
|
+
int n_not_ongrid = 0;
|
11778
|
+
for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
11779
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
11780
|
+
float id = 1/scale;
|
11781
|
+
for (int k = 0; k < 8; ++k) {
|
11782
|
+
if (is_on_grid[k]) continue;
|
11783
|
+
uint16_t u = 0;
|
11784
|
+
for (int i = 0; i < 4; ++i) {
|
11785
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
11786
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
11787
|
+
u |= (l << 3*i);
|
11788
|
+
}
|
11789
|
+
int grid_index = kmap_q3xs[u];
|
11790
|
+
if (grid_index < 0) {
|
11791
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
11792
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
|
11793
|
+
}
|
11794
|
+
const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
|
11795
|
+
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
11796
|
+
}
|
11797
|
+
float sumqx = 0, sumq2 = 0;
|
11798
|
+
for (int i = 0; i < 32; ++i) {
|
11799
|
+
float w = weight[i];
|
11800
|
+
float q = 2*L[i] + 1;
|
11801
|
+
sumqx += w*xval[i]*q;
|
11802
|
+
sumq2 += w*q*q;
|
11803
|
+
}
|
11804
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
11805
|
+
}
|
11806
|
+
if (scale < 0) {
|
11807
|
+
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
11808
|
+
// and correspondingly flip quant signs.
|
11809
|
+
scale = -scale;
|
11810
|
+
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
11811
|
+
}
|
11812
|
+
for (int k = 0; k < 8; ++k) {
|
11813
|
+
uint16_t u = 0;
|
11814
|
+
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
11815
|
+
int grid_index = kmap_q3xs[u];
|
11816
|
+
if (grid_index < 0) {
|
11817
|
+
printf("Oops: found point %u not on grid:", u);
|
11818
|
+
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
11819
|
+
printf("\n");
|
11820
|
+
GGML_ASSERT(false);
|
11821
|
+
}
|
11822
|
+
if (grid_size == 256) {
|
11823
|
+
q3[8*ib+k] = grid_index;
|
11824
|
+
} else {
|
11825
|
+
q3[8*ib+k] = grid_index & 255;
|
11826
|
+
qh[ib] |= ((grid_index >> 8) << k);
|
11827
|
+
}
|
11828
|
+
|
11829
|
+
}
|
11830
|
+
scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
|
11831
|
+
GGML_ASSERT(scale >= 0);
|
11832
|
+
scales[ib] = scale;
|
11833
|
+
max_scale = MAX(max_scale, scale);
|
11834
|
+
}
|
11835
|
+
|
11836
|
+
if (!max_scale) {
|
11837
|
+
memset(qs, 0, quant_size);
|
11838
|
+
dh += block_size/sizeof(ggml_fp16_t);
|
11839
|
+
qs += block_size;
|
11840
|
+
continue;
|
11841
|
+
}
|
11842
|
+
|
11843
|
+
float d = max_scale/31;
|
11844
|
+
dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor
|
11845
|
+
float id = 1/d;
|
11846
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
11847
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
11848
|
+
l = MAX(0, MIN(15, l));
|
11849
|
+
scales_and_signs[ib] |= ((uint32_t)l << 28);
|
11850
|
+
}
|
11851
|
+
memcpy(qs, q3, quant_size);
|
11852
|
+
|
11853
|
+
dh += block_size/sizeof(ggml_fp16_t);
|
11854
|
+
qs += block_size;
|
11855
|
+
|
11856
|
+
}
|
11857
|
+
}
|
11858
|
+
|
11859
|
+
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
11860
|
+
(void)hist;
|
11861
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
11862
|
+
int nblock = n_per_row/QK_K;
|
11863
|
+
char * qrow = (char *)dst;
|
11864
|
+
for (int row = 0; row < nrow; ++row) {
|
11865
|
+
quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
|
11866
|
+
src += n_per_row;
|
11867
|
+
qrow += nblock*sizeof(block_iq3_xxs);
|
11868
|
+
}
|
11869
|
+
return nrow * nblock * sizeof(block_iq3_xxs);
|
11870
|
+
}
|
11871
|
+
|
11872
|
+
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
|
11873
|
+
assert(k % QK_K == 0);
|
11874
|
+
block_iq3_xxs * restrict y = vy;
|
11875
|
+
quantize_row_iq3_xxs_reference(x, y, k);
|
11876
|
+
}
|
11877
|
+
|
11878
|
+
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
|
11879
|
+
assert(k % QK_K == 0);
|
11880
|
+
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
11881
|
+
}
|
11882
|
+
|
11883
|
+
static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
|
11884
|
+
const float * restrict quant_weights,
|
11885
|
+
float * scales,
|
11886
|
+
float * weight,
|
11887
|
+
float * xval,
|
11888
|
+
int8_t * L,
|
11889
|
+
int8_t * Laux,
|
11890
|
+
float * waux,
|
11891
|
+
bool * is_on_grid,
|
11892
|
+
bool * is_on_grid_aux,
|
11893
|
+
uint8_t * block_signs) {
|
11894
|
+
|
11895
|
+
const int gindex = iq3_data_index(512);
|
11896
|
+
|
11897
|
+
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
11898
|
+
const int * kmap_q3xs = iq3_data[gindex].map;
|
11899
|
+
const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
|
11900
|
+
|
11901
|
+
//GGML_ASSERT(quant_weights && "missing quantization weights");
|
11902
|
+
GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
|
11903
|
+
GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
|
11904
|
+
GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
|
11905
|
+
GGML_ASSERT(n%QK_K == 0);
|
11906
|
+
|
11907
|
+
const int kMaxQ = 8;
|
11908
|
+
|
11909
|
+
const int nbl = n/QK_K;
|
11910
|
+
|
11911
|
+
block_iq3_s * y = vy;
|
11912
|
+
|
11913
|
+
const int bs4 = block_size/4;
|
11914
|
+
const int bs8 = block_size/8;
|
11915
|
+
|
11916
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
11917
|
+
|
11918
|
+
memset(&y[ibl], 0, sizeof(block_iq3_s));
|
11919
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
11920
|
+
|
11921
|
+
uint8_t * qs = y[ibl].qs;
|
11922
|
+
uint8_t * qh = y[ibl].qh;
|
11923
|
+
uint8_t * signs = y[ibl].signs;
|
11924
|
+
|
11925
|
+
float max_scale = 0;
|
11926
|
+
|
11927
|
+
const float * xbl = x + QK_K*ibl;
|
11928
|
+
float sumx2 = 0;
|
11929
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
11930
|
+
float sigma2 = 2*sumx2/QK_K;
|
11931
|
+
|
11932
|
+
for (int ib = 0; ib < QK_K/block_size; ++ib) {
|
11933
|
+
const float * xb = xbl + block_size*ib;
|
11934
|
+
if (quant_weights) {
|
11935
|
+
const float * qw = quant_weights + QK_K*ibl + block_size*ib;
|
11936
|
+
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
10462
11937
|
} else {
|
10463
|
-
for (int i = 0; i <
|
11938
|
+
for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
|
10464
11939
|
}
|
10465
|
-
for (int i = 0; i <
|
10466
|
-
for (int k = 0; k <
|
10467
|
-
int nflip = 0;
|
11940
|
+
for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
|
11941
|
+
for (int k = 0; k < bs8; ++k) {
|
10468
11942
|
uint8_t s = 0;
|
10469
11943
|
for (int i = 0; i < 8; ++i) {
|
10470
11944
|
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
10471
11945
|
else {
|
10472
|
-
xval[8*k + i] = -xb[8*k + i];
|
10473
|
-
}
|
10474
|
-
}
|
10475
|
-
if (nflip%2) {
|
10476
|
-
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
10477
|
-
for (int i = 1; i < 8; ++i) {
|
10478
|
-
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
10479
|
-
if (ax < min) {
|
10480
|
-
min = ax; imin = i;
|
10481
|
-
}
|
11946
|
+
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
10482
11947
|
}
|
10483
|
-
xval[8*k+imin] = -xval[8*k+imin];
|
10484
|
-
s ^= (1 << imin);
|
10485
11948
|
}
|
10486
|
-
block_signs[k] = s
|
11949
|
+
block_signs[k] = s;
|
10487
11950
|
}
|
10488
11951
|
float max = xval[0];
|
10489
|
-
for (int i = 1; i <
|
11952
|
+
for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
|
10490
11953
|
if (!max) {
|
10491
11954
|
scales[ib] = 0;
|
10492
|
-
memset(L, 0, 32);
|
10493
11955
|
continue;
|
10494
11956
|
}
|
10495
11957
|
float best = 0;
|
10496
11958
|
float scale = max/(2*kMaxQ-1);
|
10497
|
-
for (int
|
11959
|
+
for (int k = 0; k < bs4; ++k) is_on_grid[k] = false;
|
11960
|
+
for (int is = -9; is <= 9; ++is) {
|
10498
11961
|
float id = (2*kMaxQ-1+is*0.2f)/max;
|
10499
11962
|
float this_scale = 1/id;
|
10500
|
-
for (int k = 0; k <
|
11963
|
+
for (int k = 0; k < bs4; ++k) {
|
10501
11964
|
for (int i = 0; i < 4; ++i) {
|
10502
11965
|
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
10503
11966
|
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
@@ -10513,7 +11976,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10513
11976
|
}
|
10514
11977
|
}
|
10515
11978
|
float sumqx = 0, sumq2 = 0;
|
10516
|
-
for (int i = 0; i <
|
11979
|
+
for (int i = 0; i < block_size; ++i) {
|
10517
11980
|
float w = weight[i];
|
10518
11981
|
float q = 2*Laux[i] + 1;
|
10519
11982
|
sumqx += w*xval[i]*q;
|
@@ -10521,16 +11984,16 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10521
11984
|
}
|
10522
11985
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
10523
11986
|
scale = sumqx/sumq2; best = scale*sumqx;
|
10524
|
-
for (int i = 0; i <
|
10525
|
-
for (int k = 0; k <
|
11987
|
+
for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
|
11988
|
+
for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
10526
11989
|
}
|
10527
11990
|
}
|
10528
11991
|
int n_not_ongrid = 0;
|
10529
|
-
for (int k = 0; k <
|
11992
|
+
for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
10530
11993
|
if (n_not_ongrid > 0 && scale > 0) {
|
10531
11994
|
float id = 1/scale;
|
10532
|
-
for (int k = 0; k <
|
10533
|
-
if (is_on_grid[k]) continue;
|
11995
|
+
for (int k = 0; k < bs4; ++k) {
|
11996
|
+
//if (is_on_grid[k]) continue;
|
10534
11997
|
uint16_t u = 0;
|
10535
11998
|
for (int i = 0; i < 4; ++i) {
|
10536
11999
|
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
@@ -10546,7 +12009,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10546
12009
|
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
10547
12010
|
}
|
10548
12011
|
float sumqx = 0, sumq2 = 0;
|
10549
|
-
for (int i = 0; i <
|
12012
|
+
for (int i = 0; i < block_size; ++i) {
|
10550
12013
|
float w = weight[i];
|
10551
12014
|
float q = 2*L[i] + 1;
|
10552
12015
|
sumqx += w*xval[i]*q;
|
@@ -10558,9 +12021,9 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10558
12021
|
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
10559
12022
|
// and correspondingly flip quant signs.
|
10560
12023
|
scale = -scale;
|
10561
|
-
for (int k = 0; k <
|
12024
|
+
for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
|
10562
12025
|
}
|
10563
|
-
for (int k = 0; k <
|
12026
|
+
for (int k = 0; k < bs4; ++k) {
|
10564
12027
|
uint16_t u = 0;
|
10565
12028
|
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
10566
12029
|
int grid_index = kmap_q3xs[u];
|
@@ -10570,99 +12033,71 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10570
12033
|
printf("\n");
|
10571
12034
|
GGML_ASSERT(false);
|
10572
12035
|
}
|
10573
|
-
|
12036
|
+
qs[k] = grid_index & 255;
|
12037
|
+
qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
|
10574
12038
|
}
|
10575
|
-
|
12039
|
+
qs += bs4;
|
12040
|
+
for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
|
12041
|
+
signs += bs8;
|
10576
12042
|
GGML_ASSERT(scale >= 0);
|
10577
12043
|
scales[ib] = scale;
|
10578
12044
|
max_scale = MAX(max_scale, scale);
|
10579
12045
|
}
|
10580
12046
|
|
10581
12047
|
if (!max_scale) {
|
10582
|
-
memset(y[ibl].qs, 0, 3*QK_K/8);
|
10583
12048
|
continue;
|
10584
12049
|
}
|
10585
12050
|
|
10586
12051
|
float d = max_scale/31;
|
10587
|
-
y[ibl].d = GGML_FP32_TO_FP16(d);
|
12052
|
+
y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f);
|
10588
12053
|
float id = 1/d;
|
10589
|
-
|
10590
|
-
|
10591
|
-
|
10592
|
-
|
10593
|
-
|
10594
|
-
|
10595
|
-
const float * xb = xbl + 32*ib;
|
10596
|
-
if (quant_weights) {
|
10597
|
-
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
10598
|
-
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
10599
|
-
} else {
|
10600
|
-
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
10601
|
-
}
|
10602
|
-
const float db = 0.25f * d * (1 + 2*l);
|
10603
|
-
for (int k = 0; k < 8; ++k) {
|
10604
|
-
const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
|
10605
|
-
const float * xk = xb + 4*k;
|
10606
|
-
const float * wk = weight + 4*k;
|
10607
|
-
//const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
|
10608
|
-
const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
|
10609
|
-
float best_mse = 0; int best_index = q3[8*ib+k];
|
10610
|
-
for (int j = 0; j < 4; ++j) {
|
10611
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
10612
|
-
best_mse += wk[j] * diff * diff;
|
10613
|
-
}
|
10614
|
-
for (int idx = 0; idx < 256; ++idx) {
|
10615
|
-
//grid = (const uint8_t *)(kgrid_q3xs + idx);
|
10616
|
-
grid = (const uint8_t *)(iq3xxs_grid + idx);
|
10617
|
-
float mse = 0;
|
10618
|
-
for (int j = 0; j < 4; ++j) {
|
10619
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
10620
|
-
mse += wk[j] * diff * diff;
|
10621
|
-
}
|
10622
|
-
if (mse < best_mse) {
|
10623
|
-
best_mse = mse; best_index = idx;
|
10624
|
-
}
|
10625
|
-
}
|
10626
|
-
q3[8*ib+k] = best_index;
|
10627
|
-
//grid = (const uint8_t *)(kgrid_q3xs + best_index);
|
10628
|
-
grid = (const uint8_t *)(iq3xxs_grid + best_index);
|
10629
|
-
for (int j = 0; j < 4; ++j) {
|
10630
|
-
float q = db * grid[j] * signs[j];
|
10631
|
-
sumqx += wk[j] * q * xk[j];
|
10632
|
-
sumq2 += wk[j] * q * q;
|
10633
|
-
}
|
10634
|
-
}
|
10635
|
-
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
10636
|
-
}
|
12054
|
+
for (int ib = 0; ib < QK_K/block_size; ib += 2) {
|
12055
|
+
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
|
12056
|
+
l1 = MAX(0, MIN(15, l1));
|
12057
|
+
int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
|
12058
|
+
l2 = MAX(0, MIN(15, l2));
|
12059
|
+
y[ibl].scales[ib/2] = l1 | (l2 << 4);
|
10637
12060
|
}
|
10638
|
-
|
12061
|
+
|
10639
12062
|
}
|
10640
12063
|
}
|
10641
12064
|
|
10642
|
-
|
12065
|
+
#define IQ3S_BLOCK_SIZE 32
|
12066
|
+
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
10643
12067
|
(void)hist;
|
10644
12068
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
10645
12069
|
int nblock = n_per_row/QK_K;
|
12070
|
+
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
12071
|
+
float weight[IQ3S_BLOCK_SIZE];
|
12072
|
+
float xval[IQ3S_BLOCK_SIZE];
|
12073
|
+
int8_t L[IQ3S_BLOCK_SIZE];
|
12074
|
+
int8_t Laux[IQ3S_BLOCK_SIZE];
|
12075
|
+
float waux[IQ3S_BLOCK_SIZE];
|
12076
|
+
bool is_on_grid[IQ3S_BLOCK_SIZE/4];
|
12077
|
+
bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
|
12078
|
+
uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
|
10646
12079
|
char * qrow = (char *)dst;
|
10647
12080
|
for (int row = 0; row < nrow; ++row) {
|
10648
|
-
|
12081
|
+
quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
|
12082
|
+
scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
|
10649
12083
|
src += n_per_row;
|
10650
|
-
qrow += nblock*sizeof(
|
12084
|
+
qrow += nblock*sizeof(block_iq3_s);
|
10651
12085
|
}
|
10652
|
-
return nrow * nblock * sizeof(
|
12086
|
+
return nrow * nblock * sizeof(block_iq3_s);
|
10653
12087
|
}
|
10654
12088
|
|
10655
|
-
void
|
12089
|
+
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
10656
12090
|
assert(k % QK_K == 0);
|
10657
|
-
|
10658
|
-
|
12091
|
+
block_iq3_s * restrict y = vy;
|
12092
|
+
quantize_row_iq3_s_reference(x, y, k);
|
10659
12093
|
}
|
10660
12094
|
|
10661
|
-
void
|
12095
|
+
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
10662
12096
|
assert(k % QK_K == 0);
|
10663
|
-
|
12097
|
+
quantize_iq3_s(x, y, 1, k, NULL, NULL);
|
10664
12098
|
}
|
10665
12099
|
|
12100
|
+
|
10666
12101
|
// =================================== 1.5 bpw ===================================================
|
10667
12102
|
|
10668
12103
|
static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
@@ -10745,7 +12180,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
10745
12180
|
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
10746
12181
|
GGML_ASSERT(n%QK_K == 0);
|
10747
12182
|
|
10748
|
-
const int nbl = n/
|
12183
|
+
const int nbl = n/QK_K;
|
10749
12184
|
|
10750
12185
|
block_iq1_s * y = vy;
|
10751
12186
|
|
@@ -10880,23 +12315,23 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|
10880
12315
|
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
10881
12316
|
}
|
10882
12317
|
|
10883
|
-
static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x,
|
10884
|
-
ggml_fp16_t * dh, uint8_t * q4,
|
10885
|
-
float * weight, uint8_t * L,
|
12318
|
+
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
|
12319
|
+
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
12320
|
+
float * scales, float * weight, uint8_t * L,
|
10886
12321
|
const int8_t * values,
|
10887
12322
|
const float * quant_weights) {
|
10888
12323
|
|
10889
12324
|
const int ntry = 7;
|
10890
12325
|
|
10891
12326
|
float sigma2 = 0;
|
10892
|
-
for (int j = 0; j <
|
10893
|
-
sigma2 *= 2.f/
|
12327
|
+
for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
|
12328
|
+
sigma2 *= 2.f/super_block_size;
|
10894
12329
|
|
10895
|
-
|
12330
|
+
memset(q4, 0, super_block_size/2);
|
12331
|
+
dh[0] = GGML_FP32_TO_FP16(0.f);
|
10896
12332
|
|
10897
|
-
|
10898
|
-
for (int ib = 0; ib <
|
10899
|
-
dh[ib] = GGML_FP32_TO_FP16(0.f);
|
12333
|
+
float max_scale = 0, amax_scale = 0;
|
12334
|
+
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
10900
12335
|
const float * xb = x + ib*block_size;
|
10901
12336
|
if (quant_weights) {
|
10902
12337
|
const float * qw = quant_weights + ib*block_size;
|
@@ -10912,6 +12347,7 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
|
|
10912
12347
|
}
|
10913
12348
|
}
|
10914
12349
|
if (!amax) {
|
12350
|
+
scales[ib] = 0;
|
10915
12351
|
continue;
|
10916
12352
|
}
|
10917
12353
|
float d = -max/values[0];
|
@@ -10925,7 +12361,6 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
|
|
10925
12361
|
sumqx += w*q*xb[j];
|
10926
12362
|
sumq2 += w*q*q;
|
10927
12363
|
}
|
10928
|
-
float best_id = id;
|
10929
12364
|
d = sumqx/sumq2;
|
10930
12365
|
float best = d*sumqx;
|
10931
12366
|
for (int itry = -ntry; itry <= ntry; ++itry) {
|
@@ -10941,15 +12376,47 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
|
|
10941
12376
|
}
|
10942
12377
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
10943
12378
|
d = sumqx/sumq2; best = d * sumqx;
|
10944
|
-
best_id = id;
|
10945
12379
|
}
|
10946
12380
|
}
|
10947
|
-
|
10948
|
-
|
10949
|
-
|
12381
|
+
scales[ib] = d;
|
12382
|
+
float abs_d = fabsf(d);
|
12383
|
+
if (abs_d > amax_scale) {
|
12384
|
+
amax_scale = abs_d; max_scale = d;
|
12385
|
+
}
|
12386
|
+
}
|
12387
|
+
|
12388
|
+
if (super_block_size/block_size > 1) {
|
12389
|
+
int nb = super_block_size/block_size;
|
12390
|
+
memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
|
12391
|
+
float d = -max_scale/32;
|
12392
|
+
dh[0] = GGML_FP32_TO_FP16(d);
|
12393
|
+
float id = d ? 1/d : 0.f;
|
12394
|
+
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
12395
|
+
int l = nearest_int(id*scales[ib]);
|
12396
|
+
l = MAX(-32, MIN(31, l));
|
12397
|
+
float dl = d * l;
|
12398
|
+
float idl = dl ? 1/dl : 0.f;
|
12399
|
+
uint8_t * Lb = L + ib*block_size;
|
12400
|
+
const float * xb = x + ib*block_size;
|
12401
|
+
for (int j = 0; j < block_size; ++j) {
|
12402
|
+
Lb[j] = best_index_int8(16, values, idl*xb[j]);
|
12403
|
+
}
|
12404
|
+
l += 32;
|
12405
|
+
uint8_t l_l = l & 0xf;
|
12406
|
+
uint8_t l_h = l >> 4;
|
12407
|
+
if (ib%2 == 0) scales_l[ib/2] = l_l;
|
12408
|
+
else scales_l[ib/2] |= (l_l << 4);
|
12409
|
+
scales_h[ib/8] |= (l_h << 2*(ib%8));
|
12410
|
+
}
|
12411
|
+
} else {
|
12412
|
+
dh[0] = GGML_FP32_TO_FP16(scales[0]);
|
12413
|
+
float id = scales[0] ? 1/scales[0] : 0;
|
12414
|
+
for (int j = 0; j < super_block_size; ++j) {
|
12415
|
+
L[j] = best_index_int8(16, values, id*x[j]);
|
10950
12416
|
}
|
10951
12417
|
}
|
10952
|
-
|
12418
|
+
|
12419
|
+
for (int i = 0; i < super_block_size/32; ++i) {
|
10953
12420
|
for (int j = 0; j < 16; ++j) {
|
10954
12421
|
q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
|
10955
12422
|
}
|
@@ -10962,12 +12429,16 @@ size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, i
|
|
10962
12429
|
int nblock = n_per_row/QK4_NL;
|
10963
12430
|
char * qrow = (char *)dst;
|
10964
12431
|
uint8_t L[QK4_NL];
|
10965
|
-
float weight[
|
12432
|
+
float weight[QK4_NL];
|
12433
|
+
uint16_t unused_h;
|
12434
|
+
uint8_t * unused_l = NULL;
|
12435
|
+
float scale;
|
10966
12436
|
for (int row = 0; row < nrow; ++row) {
|
10967
12437
|
block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
|
10968
12438
|
for (int ibl = 0; ibl < nblock; ++ibl) {
|
10969
12439
|
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
|
10970
|
-
quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs,
|
12440
|
+
quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
|
12441
|
+
&scale, weight, L, kvalues_iq4nl, qw);
|
10971
12442
|
}
|
10972
12443
|
src += n_per_row;
|
10973
12444
|
qrow += nblock*sizeof(block_iq4_nl);
|
@@ -10986,3 +12457,232 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
|
|
10986
12457
|
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
10987
12458
|
}
|
10988
12459
|
|
12460
|
+
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
12461
|
+
#if QK_K == 64
|
12462
|
+
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
12463
|
+
#else
|
12464
|
+
(void)hist;
|
12465
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
12466
|
+
int nblock = n_per_row/QK_K;
|
12467
|
+
char * qrow = (char *)dst;
|
12468
|
+
uint8_t L[QK_K];
|
12469
|
+
float weight[32];
|
12470
|
+
float scales[QK_K/32];
|
12471
|
+
for (int row = 0; row < nrow; ++row) {
|
12472
|
+
block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
|
12473
|
+
for (int ibl = 0; ibl < nblock; ++ibl) {
|
12474
|
+
const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
|
12475
|
+
quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
|
12476
|
+
scales, weight, L, kvalues_iq4nl, qw);
|
12477
|
+
}
|
12478
|
+
src += n_per_row;
|
12479
|
+
qrow += nblock*sizeof(block_iq4_xs);
|
12480
|
+
}
|
12481
|
+
return nrow * nblock * sizeof(block_iq4_xs);
|
12482
|
+
#endif
|
12483
|
+
}
|
12484
|
+
|
12485
|
+
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
12486
|
+
assert(k % QK_K == 0);
|
12487
|
+
block_iq4_xs * restrict y = vy;
|
12488
|
+
quantize_row_iq4_xs_reference(x, y, k);
|
12489
|
+
}
|
12490
|
+
|
12491
|
+
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
|
12492
|
+
assert(k % QK_K == 0);
|
12493
|
+
quantize_iq4_xs(x, y, 1, k, NULL, NULL);
|
12494
|
+
}
|
12495
|
+
|
12496
|
+
// =============================== 2.5625 bpw
|
12497
|
+
|
12498
|
+
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
12499
|
+
|
12500
|
+
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
12501
|
+
|
12502
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
12503
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
12504
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
12505
|
+
|
12506
|
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
12507
|
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
12508
|
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
12509
|
+
GGML_ASSERT(n%QK_K == 0);
|
12510
|
+
|
12511
|
+
const int kMaxQ = 3;
|
12512
|
+
|
12513
|
+
const int nbl = n/QK_K;
|
12514
|
+
|
12515
|
+
block_iq2_s * y = vy;
|
12516
|
+
|
12517
|
+
float scales[QK_K/16];
|
12518
|
+
float weight[16];
|
12519
|
+
float xval[16];
|
12520
|
+
int8_t L[16];
|
12521
|
+
int8_t Laux[16];
|
12522
|
+
float waux[16];
|
12523
|
+
bool is_on_grid[2];
|
12524
|
+
bool is_on_grid_aux[2];
|
12525
|
+
uint8_t block_signs[2];
|
12526
|
+
|
12527
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
12528
|
+
|
12529
|
+
memset(&y[ibl], 0, sizeof(block_iq2_s));
|
12530
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
12531
|
+
|
12532
|
+
float max_scale = 0;
|
12533
|
+
|
12534
|
+
const float * xbl = x + QK_K*ibl;
|
12535
|
+
float sumx2 = 0;
|
12536
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
12537
|
+
float sigma2 = 2*sumx2/QK_K;
|
12538
|
+
|
12539
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
12540
|
+
const float * xb = xbl + 16*ib;
|
12541
|
+
if (quant_weights) {
|
12542
|
+
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
12543
|
+
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
12544
|
+
} else {
|
12545
|
+
for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
|
12546
|
+
}
|
12547
|
+
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
12548
|
+
for (int k = 0; k < 2; ++k) {
|
12549
|
+
uint8_t s = 0;
|
12550
|
+
for (int i = 0; i < 8; ++i) {
|
12551
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
12552
|
+
else {
|
12553
|
+
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
12554
|
+
}
|
12555
|
+
}
|
12556
|
+
block_signs[k] = s;
|
12557
|
+
}
|
12558
|
+
float max = xval[0];
|
12559
|
+
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
12560
|
+
if (!max) {
|
12561
|
+
scales[ib] = 0;
|
12562
|
+
continue;
|
12563
|
+
}
|
12564
|
+
float best = 0;
|
12565
|
+
float scale = max/(2*kMaxQ-1);
|
12566
|
+
is_on_grid[0] = is_on_grid[1] = true;
|
12567
|
+
for (int is = -9; is <= 9; ++is) {
|
12568
|
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
12569
|
+
float this_scale = 1/id;
|
12570
|
+
for (int k = 0; k < 2; ++k) {
|
12571
|
+
for (int i = 0; i < 8; ++i) {
|
12572
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
12573
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
12574
|
+
}
|
12575
|
+
uint16_t u = 0;
|
12576
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
12577
|
+
int grid_index = kmap_q2xs[u];
|
12578
|
+
is_on_grid_aux[k] = true;
|
12579
|
+
if (grid_index < 0) {
|
12580
|
+
is_on_grid_aux[k] = false;
|
12581
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
12582
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
12583
|
+
}
|
12584
|
+
}
|
12585
|
+
float sumqx = 0, sumq2 = 0;
|
12586
|
+
for (int i = 0; i < 16; ++i) {
|
12587
|
+
float w = weight[i];
|
12588
|
+
float q = 2*Laux[i] + 1;
|
12589
|
+
sumqx += w*xval[i]*q;
|
12590
|
+
sumq2 += w*q*q;
|
12591
|
+
}
|
12592
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
12593
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
12594
|
+
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
12595
|
+
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
12596
|
+
}
|
12597
|
+
}
|
12598
|
+
int n_not_ongrid = 0;
|
12599
|
+
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
12600
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
12601
|
+
float id = 1/scale;
|
12602
|
+
for (int k = 0; k < 2; ++k) {
|
12603
|
+
if (is_on_grid[k]) continue;
|
12604
|
+
uint16_t u = 0;
|
12605
|
+
for (int i = 0; i < 8; ++i) {
|
12606
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
12607
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
12608
|
+
u |= (l << 2*i);
|
12609
|
+
L[8*k + i] = l;
|
12610
|
+
}
|
12611
|
+
int grid_index = kmap_q2xs[u];
|
12612
|
+
if (grid_index < 0) {
|
12613
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
12614
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
12615
|
+
}
|
12616
|
+
}
|
12617
|
+
float sumqx = 0, sumq2 = 0;
|
12618
|
+
for (int i = 0; i < 16; ++i) {
|
12619
|
+
float w = weight[i];
|
12620
|
+
float q = 2*L[i] + 1;
|
12621
|
+
sumqx += w*xval[i]*q;
|
12622
|
+
sumq2 += w*q*q;
|
12623
|
+
}
|
12624
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
12625
|
+
}
|
12626
|
+
if (scale < 0) {
|
12627
|
+
scale = -scale;
|
12628
|
+
for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
|
12629
|
+
}
|
12630
|
+
for (int k = 0; k < 2; ++k) {
|
12631
|
+
uint16_t u = 0;
|
12632
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
12633
|
+
int grid_index = kmap_q2xs[u];
|
12634
|
+
if (grid_index < 0) {
|
12635
|
+
printf("Oops: found point %u not on grid:", u);
|
12636
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
12637
|
+
printf("\n");
|
12638
|
+
GGML_ASSERT(false);
|
12639
|
+
}
|
12640
|
+
const int i8 = 2*ib + k;
|
12641
|
+
y[ibl].qs[i8] = grid_index & 255;
|
12642
|
+
y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
|
12643
|
+
y[ibl].qs[QK_K/8 + i8] = block_signs[k];
|
12644
|
+
}
|
12645
|
+
GGML_ASSERT(scale >= 0);
|
12646
|
+
scales[ib] = scale;
|
12647
|
+
max_scale = MAX(max_scale, scale);
|
12648
|
+
}
|
12649
|
+
|
12650
|
+
if (!max_scale) {
|
12651
|
+
continue;
|
12652
|
+
}
|
12653
|
+
|
12654
|
+
float d = max_scale/31;
|
12655
|
+
y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
|
12656
|
+
float id = 1/d;
|
12657
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
12658
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
12659
|
+
l = MAX(0, MIN(15, l));
|
12660
|
+
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
12661
|
+
else y[ibl].scales[ib/2] |= (l << 4);
|
12662
|
+
}
|
12663
|
+
}
|
12664
|
+
}
|
12665
|
+
|
12666
|
+
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
12667
|
+
(void)hist;
|
12668
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
12669
|
+
int nblock = n_per_row/QK_K;
|
12670
|
+
char * qrow = (char *)dst;
|
12671
|
+
for (int row = 0; row < nrow; ++row) {
|
12672
|
+
quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
|
12673
|
+
src += n_per_row;
|
12674
|
+
qrow += nblock*sizeof(block_iq2_s);
|
12675
|
+
}
|
12676
|
+
return nrow * nblock * sizeof(block_iq2_s);
|
12677
|
+
}
|
12678
|
+
|
12679
|
+
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
|
12680
|
+
assert(k % QK_K == 0);
|
12681
|
+
quantize_iq2_s(x, y, 1, k, NULL, NULL);
|
12682
|
+
}
|
12683
|
+
|
12684
|
+
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|
12685
|
+
assert(k % QK_K == 0);
|
12686
|
+
block_iq2_s * restrict y = vy;
|
12687
|
+
quantize_row_iq2_s_reference(x, y, k);
|
12688
|
+
}
|