llama_cpp 0.12.7 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
@@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|
462
462
|
return res;
|
463
463
|
}
|
464
464
|
|
465
|
+
// NOTE: not tested
|
466
|
+
inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
467
|
+
int8x16_t res;
|
468
|
+
|
469
|
+
res[ 0] = a[b[ 0]];
|
470
|
+
res[ 1] = a[b[ 1]];
|
471
|
+
res[ 2] = a[b[ 2]];
|
472
|
+
res[ 3] = a[b[ 3]];
|
473
|
+
res[ 4] = a[b[ 4]];
|
474
|
+
res[ 5] = a[b[ 5]];
|
475
|
+
res[ 6] = a[b[ 6]];
|
476
|
+
res[ 7] = a[b[ 7]];
|
477
|
+
res[ 8] = a[b[ 8]];
|
478
|
+
res[ 9] = a[b[ 9]];
|
479
|
+
res[10] = a[b[10]];
|
480
|
+
res[11] = a[b[11]];
|
481
|
+
res[12] = a[b[12]];
|
482
|
+
res[13] = a[b[13]];
|
483
|
+
res[14] = a[b[14]];
|
484
|
+
res[15] = a[b[15]];
|
485
|
+
|
486
|
+
return res;
|
487
|
+
}
|
488
|
+
|
465
489
|
#else
|
466
490
|
|
467
491
|
#define ggml_int16x8x2_t int16x8x2_t
|
@@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|
476
500
|
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
477
501
|
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
478
502
|
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
503
|
+
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
479
504
|
|
480
505
|
#endif
|
481
506
|
|
@@ -1852,7 +1877,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
1852
1877
|
float mins[QK_K/16];
|
1853
1878
|
float scales[QK_K/16];
|
1854
1879
|
float sw[QK_K/16];
|
1855
|
-
float weight[
|
1880
|
+
float weight[16];
|
1856
1881
|
uint8_t Ls[QK_K/16], Lm[QK_K/16];
|
1857
1882
|
|
1858
1883
|
for (int i = 0; i < nb; i++) {
|
@@ -1862,13 +1887,42 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
1862
1887
|
float sigma2 = sumx2/QK_K;
|
1863
1888
|
for (int j = 0; j < QK_K/16; ++j) {
|
1864
1889
|
const float * restrict qw = quant_weights + QK_K * i + 16*j;
|
1865
|
-
for (int l = 0; l <
|
1890
|
+
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
1866
1891
|
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
|
1867
|
-
scales[j] = make_qkx3_quants(
|
1892
|
+
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
1868
1893
|
}
|
1869
1894
|
|
1870
|
-
float dm
|
1871
|
-
|
1895
|
+
float dm, mm;
|
1896
|
+
#if QK_K == 64
|
1897
|
+
float max_scale = 0, max_min = 0;
|
1898
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
1899
|
+
max_scale = MAX(max_scale, scales[j]);
|
1900
|
+
max_min = MAX(max_min, mins[j]);
|
1901
|
+
}
|
1902
|
+
dm = max_scale/15;
|
1903
|
+
mm = max_min/15;
|
1904
|
+
if (max_scale) {
|
1905
|
+
float id = 1/dm;
|
1906
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
1907
|
+
int l = nearest_int(id*scales[j]);
|
1908
|
+
Ls[j] = MAX(0, MIN(15, l));
|
1909
|
+
}
|
1910
|
+
} else {
|
1911
|
+
memset(Ls, 0, QK_K/16);
|
1912
|
+
}
|
1913
|
+
if (max_min) {
|
1914
|
+
float id = 1/mm;
|
1915
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
1916
|
+
int l = nearest_int(id*mins[j]);
|
1917
|
+
Lm[j] = MAX(0, MIN(15, l));
|
1918
|
+
}
|
1919
|
+
} else {
|
1920
|
+
memset(Lm, 0, QK_K/16);
|
1921
|
+
}
|
1922
|
+
#else
|
1923
|
+
dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
|
1924
|
+
mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
|
1925
|
+
#endif
|
1872
1926
|
y[i].d = GGML_FP32_TO_FP16(dm);
|
1873
1927
|
y[i].dmin = GGML_FP32_TO_FP16(mm);
|
1874
1928
|
dm = GGML_FP16_TO_FP32(y[i].d);
|
@@ -3470,6 +3524,265 @@ static const uint64_t iq2xs_grid[512] = {
|
|
3470
3524
|
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
3471
3525
|
};
|
3472
3526
|
|
3527
|
+
static const uint64_t iq2s_grid[1024] = {
|
3528
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
3529
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
3530
|
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
3531
|
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
3532
|
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
3533
|
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
3534
|
+
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
3535
|
+
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
3536
|
+
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
3537
|
+
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
3538
|
+
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
3539
|
+
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
3540
|
+
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
3541
|
+
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
3542
|
+
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
3543
|
+
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
3544
|
+
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
3545
|
+
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
3546
|
+
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
3547
|
+
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
3548
|
+
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
3549
|
+
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
3550
|
+
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
3551
|
+
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
3552
|
+
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
3553
|
+
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
3554
|
+
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
3555
|
+
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
3556
|
+
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
3557
|
+
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
3558
|
+
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
3559
|
+
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
3560
|
+
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
3561
|
+
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
3562
|
+
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
3563
|
+
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
3564
|
+
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
3565
|
+
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
3566
|
+
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
3567
|
+
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
3568
|
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
3569
|
+
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
3570
|
+
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
3571
|
+
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
3572
|
+
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
3573
|
+
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
3574
|
+
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
3575
|
+
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
3576
|
+
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
3577
|
+
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
3578
|
+
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
3579
|
+
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
3580
|
+
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
3581
|
+
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
3582
|
+
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
3583
|
+
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
3584
|
+
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
3585
|
+
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
3586
|
+
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
3587
|
+
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
3588
|
+
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
3589
|
+
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
3590
|
+
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
3591
|
+
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
3592
|
+
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
3593
|
+
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
3594
|
+
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
3595
|
+
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
3596
|
+
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
3597
|
+
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
3598
|
+
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
3599
|
+
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
3600
|
+
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
3601
|
+
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
3602
|
+
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
3603
|
+
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
3604
|
+
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
3605
|
+
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
3606
|
+
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
3607
|
+
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
3608
|
+
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
3609
|
+
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
3610
|
+
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
3611
|
+
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
3612
|
+
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
3613
|
+
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
3614
|
+
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
3615
|
+
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
3616
|
+
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
3617
|
+
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
3618
|
+
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
3619
|
+
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
3620
|
+
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
3621
|
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
3622
|
+
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
3623
|
+
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
3624
|
+
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
3625
|
+
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
3626
|
+
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
3627
|
+
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
3628
|
+
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
3629
|
+
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
3630
|
+
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
3631
|
+
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
3632
|
+
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
3633
|
+
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
3634
|
+
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
3635
|
+
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
3636
|
+
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
3637
|
+
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
3638
|
+
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
3639
|
+
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
3640
|
+
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
3641
|
+
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
3642
|
+
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
3643
|
+
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
3644
|
+
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
3645
|
+
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
3646
|
+
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
3647
|
+
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
3648
|
+
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
3649
|
+
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
3650
|
+
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
3651
|
+
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
3652
|
+
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
3653
|
+
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
3654
|
+
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
3655
|
+
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
3656
|
+
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
3657
|
+
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
3658
|
+
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
3659
|
+
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
3660
|
+
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
3661
|
+
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
3662
|
+
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
3663
|
+
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
3664
|
+
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
3665
|
+
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
3666
|
+
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
3667
|
+
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
3668
|
+
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
3669
|
+
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
3670
|
+
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
3671
|
+
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
3672
|
+
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
3673
|
+
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
3674
|
+
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
3675
|
+
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
3676
|
+
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
3677
|
+
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
3678
|
+
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
3679
|
+
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
3680
|
+
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
3681
|
+
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
3682
|
+
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
3683
|
+
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
3684
|
+
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
3685
|
+
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
3686
|
+
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
3687
|
+
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
3688
|
+
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
3689
|
+
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
3690
|
+
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
3691
|
+
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
3692
|
+
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
3693
|
+
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
3694
|
+
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
3695
|
+
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
3696
|
+
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
3697
|
+
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
3698
|
+
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
3699
|
+
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
3700
|
+
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
3701
|
+
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
3702
|
+
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
3703
|
+
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
3704
|
+
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
3705
|
+
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
3706
|
+
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
3707
|
+
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
3708
|
+
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
3709
|
+
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
3710
|
+
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
3711
|
+
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
3712
|
+
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
3713
|
+
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
3714
|
+
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
3715
|
+
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
3716
|
+
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
3717
|
+
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
3718
|
+
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
3719
|
+
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
3720
|
+
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
3721
|
+
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
3722
|
+
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
3723
|
+
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
3724
|
+
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
3725
|
+
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
3726
|
+
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
3727
|
+
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
3728
|
+
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
3729
|
+
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
3730
|
+
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
3731
|
+
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
3732
|
+
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
3733
|
+
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
3734
|
+
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
3735
|
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
3736
|
+
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
3737
|
+
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
3738
|
+
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
3739
|
+
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
3740
|
+
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
3741
|
+
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
3742
|
+
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
3743
|
+
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
3744
|
+
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
3745
|
+
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
3746
|
+
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
3747
|
+
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
3748
|
+
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
3749
|
+
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
3750
|
+
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
3751
|
+
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
3752
|
+
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
3753
|
+
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
3754
|
+
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
3755
|
+
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
3756
|
+
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
3757
|
+
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
3758
|
+
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
3759
|
+
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
3760
|
+
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
3761
|
+
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
3762
|
+
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
3763
|
+
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
3764
|
+
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
3765
|
+
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
3766
|
+
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
3767
|
+
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
3768
|
+
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
3769
|
+
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
3770
|
+
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
3771
|
+
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
3772
|
+
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
3773
|
+
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
3774
|
+
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
3775
|
+
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
3776
|
+
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
3777
|
+
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
3778
|
+
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
3779
|
+
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
3780
|
+
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
3781
|
+
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
3782
|
+
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
3783
|
+
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
3784
|
+
};
|
3785
|
+
|
3473
3786
|
static const uint32_t iq3xxs_grid[256] = {
|
3474
3787
|
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
3475
3788
|
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
@@ -3505,6 +3818,73 @@ static const uint32_t iq3xxs_grid[256] = {
|
|
3505
3818
|
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
3506
3819
|
};
|
3507
3820
|
|
3821
|
+
static const uint32_t iq3xs_grid[512] = {
|
3822
|
+
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
3823
|
+
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
3824
|
+
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
3825
|
+
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
3826
|
+
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
3827
|
+
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
3828
|
+
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
3829
|
+
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
3830
|
+
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
3831
|
+
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
3832
|
+
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
3833
|
+
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
3834
|
+
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
3835
|
+
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
3836
|
+
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
3837
|
+
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
3838
|
+
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
3839
|
+
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
3840
|
+
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
3841
|
+
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
3842
|
+
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
3843
|
+
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
3844
|
+
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
3845
|
+
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
3846
|
+
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
3847
|
+
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
3848
|
+
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
3849
|
+
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
3850
|
+
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
3851
|
+
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
3852
|
+
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
3853
|
+
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
3854
|
+
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
3855
|
+
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
3856
|
+
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
3857
|
+
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
3858
|
+
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
3859
|
+
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
3860
|
+
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
3861
|
+
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
3862
|
+
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
3863
|
+
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
3864
|
+
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
3865
|
+
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
3866
|
+
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
3867
|
+
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
3868
|
+
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
3869
|
+
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
3870
|
+
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
3871
|
+
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
3872
|
+
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
3873
|
+
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
3874
|
+
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
3875
|
+
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
3876
|
+
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
3877
|
+
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
3878
|
+
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
3879
|
+
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
3880
|
+
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
3881
|
+
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
3882
|
+
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
3883
|
+
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
3884
|
+
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
3885
|
+
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
3886
|
+
};
|
3887
|
+
|
3508
3888
|
#define NGRID_IQ2XXS 512
|
3509
3889
|
static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
|
3510
3890
|
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
@@ -3704,6 +4084,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
3704
4084
|
}
|
3705
4085
|
}
|
3706
4086
|
|
4087
|
+
// ====================== 2.5625 bpw (de)-quantization
|
4088
|
+
|
4089
|
+
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
|
4090
|
+
assert(k % QK_K == 0);
|
4091
|
+
const int nb = k / QK_K;
|
4092
|
+
|
4093
|
+
float db[2];
|
4094
|
+
|
4095
|
+
for (int i = 0; i < nb; i++) {
|
4096
|
+
|
4097
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4098
|
+
const uint8_t * qs = x[i].qs;
|
4099
|
+
const uint8_t * qh = x[i].qh;
|
4100
|
+
const uint8_t * signs = qs + QK_K/8;
|
4101
|
+
|
4102
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
4103
|
+
db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
|
4104
|
+
db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
|
4105
|
+
for (int l = 0; l < 4; ++l) {
|
4106
|
+
const float dl = db[l/2];
|
4107
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
4108
|
+
for (int j = 0; j < 8; ++j) {
|
4109
|
+
y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
4110
|
+
}
|
4111
|
+
y += 8;
|
4112
|
+
}
|
4113
|
+
qs += 4;
|
4114
|
+
signs += 4;
|
4115
|
+
}
|
4116
|
+
}
|
4117
|
+
}
|
4118
|
+
|
3707
4119
|
// ====================== 3.0625 bpw (de)-quantization
|
3708
4120
|
|
3709
4121
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
|
@@ -3736,6 +4148,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
|
3736
4148
|
}
|
3737
4149
|
}
|
3738
4150
|
|
4151
|
+
// ====================== 3.3125 bpw (de)-quantization
|
4152
|
+
|
4153
|
+
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
|
4154
|
+
assert(k % QK_K == 0);
|
4155
|
+
const int nb = k / QK_K;
|
4156
|
+
|
4157
|
+
for (int i = 0; i < nb; i++) {
|
4158
|
+
|
4159
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4160
|
+
const uint8_t * qs = x[i].qs;
|
4161
|
+
const uint8_t * qh = x[i].qh;
|
4162
|
+
const uint8_t * signs = x[i].signs;
|
4163
|
+
|
4164
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
4165
|
+
const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f;
|
4166
|
+
const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f;
|
4167
|
+
for (int l = 0; l < 4; ++l) {
|
4168
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
4169
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
4170
|
+
for (int j = 0; j < 4; ++j) {
|
4171
|
+
y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4172
|
+
y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4173
|
+
}
|
4174
|
+
y += 8;
|
4175
|
+
}
|
4176
|
+
qs += 8;
|
4177
|
+
signs += 4;
|
4178
|
+
for (int l = 0; l < 4; ++l) {
|
4179
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
|
4180
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
|
4181
|
+
for (int j = 0; j < 4; ++j) {
|
4182
|
+
y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4183
|
+
y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4184
|
+
}
|
4185
|
+
y += 8;
|
4186
|
+
}
|
4187
|
+
qh += 2;
|
4188
|
+
qs += 8;
|
4189
|
+
signs += 4;
|
4190
|
+
}
|
4191
|
+
}
|
4192
|
+
}
|
4193
|
+
|
3739
4194
|
// ====================== 1.5625 bpw (de)-quantization
|
3740
4195
|
|
3741
4196
|
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
|
@@ -3799,6 +4254,33 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
|
3799
4254
|
}
|
3800
4255
|
}
|
3801
4256
|
|
4257
|
+
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
4258
|
+
assert(k % QK_K == 0);
|
4259
|
+
#if QK_K == 64
|
4260
|
+
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
4261
|
+
#else
|
4262
|
+
const int nb = k / QK_K;
|
4263
|
+
|
4264
|
+
for (int i = 0; i < nb; i++) {
|
4265
|
+
|
4266
|
+
const uint8_t * qs = x[i].qs;
|
4267
|
+
|
4268
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4269
|
+
|
4270
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
4271
|
+
const int ls = ((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4);
|
4272
|
+
const float dl = d * (ls - 32);
|
4273
|
+
for (int j = 0; j < 16; ++j) {
|
4274
|
+
y[j+ 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
|
4275
|
+
y[j+16] = dl * kvalues_iq4nl[qs[j] >> 4];
|
4276
|
+
}
|
4277
|
+
y += 32;
|
4278
|
+
qs += 16;
|
4279
|
+
}
|
4280
|
+
}
|
4281
|
+
#endif
|
4282
|
+
}
|
4283
|
+
|
3802
4284
|
//===================================== Q8_K ==============================================
|
3803
4285
|
|
3804
4286
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
@@ -5857,7 +6339,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
5857
6339
|
|
5858
6340
|
float sumf = 0;
|
5859
6341
|
|
5860
|
-
int isum[
|
6342
|
+
int isum[QK_K/16];
|
5861
6343
|
|
5862
6344
|
for (int i = 0; i < nb; ++i) {
|
5863
6345
|
|
@@ -5873,14 +6355,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
5873
6355
|
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
5874
6356
|
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
5875
6357
|
|
5876
|
-
isum
|
6358
|
+
memset(isum, 0, (QK_K/16)*sizeof(int));
|
5877
6359
|
for (int l = 0; l < 16; ++l) {
|
5878
6360
|
isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
|
5879
6361
|
isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
|
5880
6362
|
isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
|
5881
6363
|
isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
|
5882
6364
|
}
|
5883
|
-
for (int l = 0; l <
|
6365
|
+
for (int l = 0; l < QK_K/16; ++l) {
|
5884
6366
|
isum[l] *= (sc[l] & 0xF);
|
5885
6367
|
}
|
5886
6368
|
sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
|
@@ -8806,6 +9288,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
8806
9288
|
|
8807
9289
|
#endif
|
8808
9290
|
|
9291
|
+
#if defined (__AVX2__) || defined (__ARM_NEON)
|
8809
9292
|
static const int8_t keven_signs_q2xs[1024] = {
|
8810
9293
|
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
8811
9294
|
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
@@ -8840,6 +9323,7 @@ static const int8_t keven_signs_q2xs[1024] = {
|
|
8840
9323
|
1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
|
8841
9324
|
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
8842
9325
|
};
|
9326
|
+
#endif
|
8843
9327
|
|
8844
9328
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
8845
9329
|
assert(n % QK_K == 0);
|
@@ -9037,15 +9521,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9037
9521
|
|
9038
9522
|
#elif defined(__AVX2__)
|
9039
9523
|
|
9040
|
-
const __m128i m4 = _mm_set1_epi8(0xf);
|
9041
|
-
const __m128i m1 = _mm_set1_epi8(1);
|
9042
|
-
const __m256i m511 = _mm256_set1_epi16(511);
|
9043
9524
|
const __m256i mone = _mm256_set1_epi8(1);
|
9044
|
-
|
9045
|
-
static const uint8_t k_bit_helper[32] = {
|
9046
|
-
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9047
|
-
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9048
|
-
};
|
9049
9525
|
static const char block_sign_shuffle_mask_1[32] = {
|
9050
9526
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
9051
9527
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
@@ -9059,11 +9535,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9059
9535
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9060
9536
|
};
|
9061
9537
|
|
9062
|
-
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
9063
9538
|
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
|
9064
9539
|
const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
|
9065
9540
|
const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
|
9066
9541
|
|
9542
|
+
#if QK_K == 64
|
9543
|
+
static const uint8_t k_bit_helper[16] = {
|
9544
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9545
|
+
};
|
9546
|
+
const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
|
9547
|
+
const __m128i m511 = _mm_set1_epi16(511);
|
9548
|
+
typedef union {
|
9549
|
+
__m128i vec_index;
|
9550
|
+
uint16_t index[8];
|
9551
|
+
} index_t;
|
9552
|
+
|
9553
|
+
index_t idx;
|
9554
|
+
__m256 accumf = _mm256_setzero_ps();
|
9555
|
+
for (int i = 0; i < nb; ++i) {
|
9556
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9557
|
+
const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
|
9558
|
+
idx.vec_index = _mm_and_si128(q2_data, m511);
|
9559
|
+
|
9560
|
+
const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
|
9561
|
+
const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
|
9562
|
+
const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
|
9563
|
+
|
9564
|
+
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
9565
|
+
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
|
9566
|
+
const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
|
9567
|
+
|
9568
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
9569
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
|
9570
|
+
|
9571
|
+
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
|
9572
|
+
iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
|
9573
|
+
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
|
9574
|
+
iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
|
9575
|
+
|
9576
|
+
__m256i signs;
|
9577
|
+
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
|
9578
|
+
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
9579
|
+
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
|
9580
|
+
|
9581
|
+
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
|
9582
|
+
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
9583
|
+
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
|
9584
|
+
|
9585
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
9586
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
9587
|
+
|
9588
|
+
const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
|
9589
|
+
const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
|
9590
|
+
|
9591
|
+
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
|
9592
|
+
|
9593
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
|
9594
|
+
|
9595
|
+
}
|
9596
|
+
|
9597
|
+
*s = 0.125f * hsum_float_8(accumf);
|
9598
|
+
#else
|
9599
|
+
|
9600
|
+
static const uint8_t k_bit_helper[32] = {
|
9601
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9602
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9603
|
+
};
|
9604
|
+
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
9605
|
+
const __m256i m511 = _mm256_set1_epi16(511);
|
9606
|
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
9607
|
+
const __m128i m1 = _mm_set1_epi8(1);
|
9608
|
+
|
9067
9609
|
uint64_t aux64;
|
9068
9610
|
|
9069
9611
|
// somewhat hacky, but gives a significant boost in performance
|
@@ -9152,6 +9694,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9152
9694
|
}
|
9153
9695
|
|
9154
9696
|
*s = 0.125f * hsum_float_8(accumf);
|
9697
|
+
#endif
|
9155
9698
|
|
9156
9699
|
#else
|
9157
9700
|
|
@@ -9193,7 +9736,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9193
9736
|
#endif
|
9194
9737
|
}
|
9195
9738
|
|
9196
|
-
void
|
9739
|
+
void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
9197
9740
|
assert(n % QK_K == 0);
|
9198
9741
|
assert(nrc == 1);
|
9199
9742
|
UNUSED(nrc);
|
@@ -9201,88 +9744,148 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9201
9744
|
UNUSED(by);
|
9202
9745
|
UNUSED(bs);
|
9203
9746
|
|
9204
|
-
const
|
9205
|
-
const block_q8_K
|
9747
|
+
const block_iq2_s * restrict x = vx;
|
9748
|
+
const block_q8_K * restrict y = vy;
|
9206
9749
|
|
9207
9750
|
const int nb = n / QK_K;
|
9208
9751
|
|
9209
9752
|
#if defined(__ARM_NEON)
|
9210
9753
|
|
9211
|
-
|
9754
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
9755
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
9756
|
+
};
|
9212
9757
|
|
9213
|
-
|
9758
|
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
9214
9759
|
|
9215
|
-
|
9760
|
+
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
|
9761
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
9762
|
+
const uint8x16_t m1 = vdupq_n_u8(1);
|
9763
|
+
const int32x4_t vzero = vdupq_n_s32(0);
|
9764
|
+
|
9765
|
+
uint8x16x2_t vs;
|
9766
|
+
ggml_int8x16x4_t q2s;
|
9216
9767
|
ggml_int8x16x4_t q8b;
|
9217
9768
|
|
9218
9769
|
float sumf = 0;
|
9219
9770
|
for (int i = 0; i < nb; ++i) {
|
9771
|
+
|
9220
9772
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9221
|
-
|
9222
|
-
const uint8_t * restrict
|
9223
|
-
const
|
9224
|
-
|
9773
|
+
|
9774
|
+
const uint8_t * restrict qs = x[i].qs;
|
9775
|
+
const uint8_t * restrict qh = x[i].qh;
|
9776
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
9777
|
+
const int8_t * restrict q8 = y[i].qs;
|
9778
|
+
|
9779
|
+
int sumi1 = 0, sumi2 = 0;
|
9225
9780
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9226
9781
|
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9227
|
-
|
9228
|
-
|
9229
|
-
const
|
9230
|
-
|
9231
|
-
const
|
9232
|
-
|
9233
|
-
|
9234
|
-
|
9235
|
-
|
9236
|
-
|
9237
|
-
|
9238
|
-
|
9239
|
-
|
9240
|
-
|
9241
|
-
|
9242
|
-
|
9243
|
-
|
9244
|
-
|
9782
|
+
q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
|
9783
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
|
9784
|
+
q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
|
9785
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
|
9786
|
+
q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
|
9787
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
|
9788
|
+
q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
|
9789
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
|
9790
|
+
qs += 8;
|
9791
|
+
|
9792
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
9793
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9794
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9795
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
9796
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9797
|
+
|
9798
|
+
q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
|
9799
|
+
q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
|
9800
|
+
|
9801
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
9802
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9803
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9804
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
9805
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9806
|
+
|
9807
|
+
signs += 4;
|
9808
|
+
|
9809
|
+
q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
|
9810
|
+
q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
|
9811
|
+
|
9812
|
+
const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
|
9813
|
+
const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
|
9814
|
+
const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
|
9815
|
+
const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
|
9816
|
+
|
9817
|
+
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
|
9818
|
+
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
|
9819
|
+
sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
|
9820
|
+
sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
|
9245
9821
|
}
|
9246
|
-
sumf += d*(
|
9822
|
+
sumf += d*(sumi1 + sumi2);
|
9247
9823
|
}
|
9248
|
-
|
9824
|
+
|
9825
|
+
*s = 0.125f * sumf;
|
9249
9826
|
|
9250
9827
|
#elif defined(__AVX2__)
|
9251
9828
|
|
9252
|
-
|
9829
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
9830
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
9831
|
+
};
|
9253
9832
|
|
9254
|
-
|
9833
|
+
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9834
|
+
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9835
|
+
};
|
9836
|
+
|
9837
|
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
9838
|
+
const __m128i m1 = _mm_set1_epi8(1);
|
9839
|
+
|
9840
|
+
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
9841
|
+
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
9842
|
+
|
9843
|
+
uint64_t aux64;
|
9255
9844
|
|
9256
9845
|
__m256 accumf = _mm256_setzero_ps();
|
9257
9846
|
for (int i = 0; i < nb; ++i) {
|
9258
9847
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9259
|
-
const uint8_t * restrict
|
9260
|
-
const uint8_t * restrict
|
9848
|
+
const uint8_t * restrict qs = x[i].qs;
|
9849
|
+
const uint8_t * restrict qh = x[i].qh;
|
9850
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
9261
9851
|
const int8_t * restrict q8 = y[i].qs;
|
9852
|
+
|
9853
|
+
memcpy(&aux64, x[i].scales, 8);
|
9854
|
+
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
|
9855
|
+
const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
|
9856
|
+
|
9262
9857
|
__m256i sumi1 = _mm256_setzero_si256();
|
9263
9858
|
__m256i sumi2 = _mm256_setzero_si256();
|
9264
9859
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9265
9860
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
9266
9861
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
9267
|
-
const __m256i q2_1 =
|
9268
|
-
|
9269
|
-
|
9270
|
-
|
9271
|
-
|
9272
|
-
|
9273
|
-
|
9274
|
-
|
9275
|
-
|
9276
|
-
|
9277
|
-
|
9278
|
-
|
9279
|
-
const __m256i
|
9280
|
-
const __m256i
|
9281
|
-
|
9282
|
-
|
9283
|
-
|
9284
|
-
const __m256i
|
9285
|
-
const __m256i
|
9862
|
+
const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
|
9863
|
+
iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
|
9864
|
+
iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
|
9865
|
+
iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
|
9866
|
+
const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
|
9867
|
+
iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
|
9868
|
+
iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
|
9869
|
+
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
9870
|
+
qs += 8;
|
9871
|
+
|
9872
|
+
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
9873
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9874
|
+
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
9875
|
+
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
9876
|
+
|
9877
|
+
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
9878
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9879
|
+
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
9880
|
+
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
9881
|
+
|
9882
|
+
signs += 4;
|
9883
|
+
|
9884
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
|
9885
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
|
9886
|
+
|
9887
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
|
9888
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
|
9286
9889
|
sumi1 = _mm256_add_epi32(sumi1, p1);
|
9287
9890
|
sumi2 = _mm256_add_epi32(sumi2, p2);
|
9288
9891
|
}
|
@@ -9291,18 +9894,162 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9291
9894
|
|
9292
9895
|
}
|
9293
9896
|
|
9294
|
-
*s = 0.
|
9897
|
+
*s = 0.125f * hsum_float_8(accumf);
|
9295
9898
|
|
9296
9899
|
#else
|
9297
9900
|
|
9298
|
-
|
9901
|
+
float sumf = 0;
|
9902
|
+
for (int i = 0; i < nb; i++) {
|
9299
9903
|
|
9300
|
-
float sumf = 0.f;
|
9301
|
-
for (int i = 0; i < nb; ++i) {
|
9302
9904
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9303
|
-
const
|
9304
|
-
const uint8_t *
|
9305
|
-
const
|
9905
|
+
const int8_t * q8 = y[i].qs;
|
9906
|
+
const uint8_t * qs = x[i].qs;
|
9907
|
+
const uint8_t * qh = x[i].qh;
|
9908
|
+
const uint8_t * signs = qs + QK_K/8;
|
9909
|
+
|
9910
|
+
int bsum = 0;
|
9911
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
9912
|
+
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
9913
|
+
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
9914
|
+
int sumi1 = 0, sumi2 = 0;
|
9915
|
+
for (int l = 0; l < 2; ++l) {
|
9916
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
9917
|
+
for (int j = 0; j < 8; ++j) {
|
9918
|
+
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
9919
|
+
}
|
9920
|
+
q8 += 8;
|
9921
|
+
}
|
9922
|
+
for (int l = 2; l < 4; ++l) {
|
9923
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
9924
|
+
for (int j = 0; j < 8; ++j) {
|
9925
|
+
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
9926
|
+
}
|
9927
|
+
q8 += 8;
|
9928
|
+
}
|
9929
|
+
bsum += ls1 * sumi1 + ls2 * sumi2;
|
9930
|
+
qs += 4;
|
9931
|
+
signs += 4;
|
9932
|
+
}
|
9933
|
+
|
9934
|
+
sumf += d * bsum;
|
9935
|
+
}
|
9936
|
+
|
9937
|
+
*s = 0.125f * sumf;
|
9938
|
+
|
9939
|
+
#endif
|
9940
|
+
|
9941
|
+
}
|
9942
|
+
|
9943
|
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
9944
|
+
assert(n % QK_K == 0);
|
9945
|
+
assert(nrc == 1);
|
9946
|
+
UNUSED(nrc);
|
9947
|
+
UNUSED(bx);
|
9948
|
+
UNUSED(by);
|
9949
|
+
UNUSED(bs);
|
9950
|
+
|
9951
|
+
const block_iq3_xxs * restrict x = vx;
|
9952
|
+
const block_q8_K * restrict y = vy;
|
9953
|
+
|
9954
|
+
const int nb = n / QK_K;
|
9955
|
+
|
9956
|
+
#if defined(__ARM_NEON)
|
9957
|
+
|
9958
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
9959
|
+
|
9960
|
+
uint32_t aux32[2];
|
9961
|
+
|
9962
|
+
ggml_int8x16x4_t q3s;
|
9963
|
+
ggml_int8x16x4_t q8b;
|
9964
|
+
|
9965
|
+
float sumf = 0;
|
9966
|
+
for (int i = 0; i < nb; ++i) {
|
9967
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9968
|
+
const uint8_t * restrict q3 = x[i].qs;
|
9969
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
9970
|
+
const int8_t * restrict q8 = y[i].qs;
|
9971
|
+
float sumf1 = 0, sumf2 = 0;
|
9972
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9973
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9974
|
+
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
|
9975
|
+
const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
|
9976
|
+
const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
|
9977
|
+
const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
|
9978
|
+
const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
|
9979
|
+
q3 += 16;
|
9980
|
+
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
|
9981
|
+
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
|
9982
|
+
q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
|
9983
|
+
q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
|
9984
|
+
q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
|
9985
|
+
q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
|
9986
|
+
q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
|
9987
|
+
q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
|
9988
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
9989
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
9990
|
+
sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
|
9991
|
+
sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
|
9992
|
+
}
|
9993
|
+
sumf += d*(sumf1 + sumf2);
|
9994
|
+
}
|
9995
|
+
*s = 0.5f * sumf;
|
9996
|
+
|
9997
|
+
#elif defined(__AVX2__)
|
9998
|
+
|
9999
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
10000
|
+
|
10001
|
+
uint32_t aux32[2];
|
10002
|
+
|
10003
|
+
__m256 accumf = _mm256_setzero_ps();
|
10004
|
+
for (int i = 0; i < nb; ++i) {
|
10005
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10006
|
+
const uint8_t * restrict q3 = x[i].qs;
|
10007
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
10008
|
+
const int8_t * restrict q8 = y[i].qs;
|
10009
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
10010
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
10011
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10012
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10013
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10014
|
+
const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
10015
|
+
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
10016
|
+
q3 += 8;
|
10017
|
+
const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
10018
|
+
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
10019
|
+
q3 += 8;
|
10020
|
+
memcpy(aux32, gas, 8); gas += 8;
|
10021
|
+
const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
|
10022
|
+
signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
|
10023
|
+
const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
|
10024
|
+
signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
10025
|
+
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
|
10026
|
+
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
|
10027
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
10028
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
10029
|
+
const uint16_t ls1 = aux32[0] >> 28;
|
10030
|
+
const uint16_t ls2 = aux32[1] >> 28;
|
10031
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
|
10032
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
|
10033
|
+
sumi1 = _mm256_add_epi32(sumi1, p1);
|
10034
|
+
sumi2 = _mm256_add_epi32(sumi2, p2);
|
10035
|
+
}
|
10036
|
+
|
10037
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
10038
|
+
|
10039
|
+
}
|
10040
|
+
|
10041
|
+
*s = 0.25f * hsum_float_8(accumf);
|
10042
|
+
|
10043
|
+
#else
|
10044
|
+
|
10045
|
+
uint32_t aux32;
|
10046
|
+
|
10047
|
+
float sumf = 0.f;
|
10048
|
+
for (int i = 0; i < nb; ++i) {
|
10049
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10050
|
+
const uint8_t * restrict q3 = x[i].qs;
|
10051
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
10052
|
+
const int8_t * restrict q8 = y[i].qs;
|
9306
10053
|
int32_t bsum = 0;
|
9307
10054
|
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
9308
10055
|
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
@@ -9327,6 +10074,202 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9327
10074
|
#endif
|
9328
10075
|
}
|
9329
10076
|
|
10077
|
+
// Dot product of one row quantized as IQ3_S with one row quantized as Q8_K.
// n must be a multiple of QK_K; the scalar result is written to *s.
// Only nrc == 1 is supported (asserted); the bs/bx/by strides are unused.
//
// IQ3_S layout as consumed here: per 32-value sub-block, 8 grid indices whose
// low 8 bits live in x[].qs and whose 9th bit is packed into x[].qh; per-value
// sign bits in x[].signs (one bit per quant, 4 bytes per sub-block); 4-bit
// block scales packed two-per-byte in x[].scales, applied as (1 + 2*scale).
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_s * restrict x = vx;
    const block_q8_K  * restrict y = vy;

    const int nb = n / QK_K;

#if defined(__ARM_NEON)

    // k_mask1 replicates each of the 4 sign bytes across 8 lanes; k_mask2 then
    // isolates one sign bit per lane, so (tbl & mask2) == mask2 marks negatives.
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

    const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
    const uint8x16_t  mask2 = vld1q_u8(k_mask2);

    uint8x16x2_t vs;
    ggml_int8x16x4_t q3s;
    ggml_int8x16x4_t q8b;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * restrict qs = x[i].qs;
        const uint8_t * restrict qh = x[i].qh;
        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
        const int8_t * restrict q8 = y[i].qs;
        int sumi1 = 0, sumi2 = 0;
        // Two 32-value sub-blocks per iteration (64 quants, 64 q8 bytes).
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
            // Gather four 4-byte grid entries per vector; the 9th index bit
            // comes from successive bits of qh[ib32+0] / qh[ib32+1].
            const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
                                          iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
            const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
                                          iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
            const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
                                          iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
            const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
                                          iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
            qs += 16;

            // Expand 32 sign bits into two full 0x00/0xFF byte masks.
            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
            vs.val[0] = vceqq_u8(vs.val[0], mask2);
            vs.val[1] = vceqq_u8(vs.val[1], mask2);

            // (grid ^ mask) - mask == -grid where mask is 0xFF, grid elsewhere.
            q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0]));
            q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));

            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
            vs.val[0] = vceqq_u8(vs.val[0], mask2);
            vs.val[1] = vceqq_u8(vs.val[1], mask2);

            signs += 4;

            q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0]));
            q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1]));

            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
            // Block scale encoded as (1 + 2*nibble), low nibble for the first
            // sub-block of the pair, high nibble for the second.
            sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
            sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >>  4));
        }
        sumf += d*(sumi1 + sumi2);
    }
    // NOTE(review): the fixed 0.25f presumably compensates for scaling built
    // into the iq3xs_grid values — confirm against the quantizer.
    *s = 0.25f * sumf;

#elif defined(__AVX2__)

    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    };

    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);

    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * restrict qs = x[i].qs;
        const uint8_t * restrict qh = x[i].qh;
        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
        const int8_t * restrict q8 = y[i].qs;
        __m256i sumi1 = _mm256_setzero_si256();
        __m256i sumi2 = _mm256_setzero_si256();
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            // Same 8+1 bit grid-index reconstruction as the NEON path,
            // gathered straight into a 256-bit lane (note set_epi32 takes
            // arguments high-to-low).
            const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)],
                                                  iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)],
                                                  iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)],
                                                  iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)],
                                                  iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)],
                                                  iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)],
                                                  iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)],
                                                  iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
            qs += 8;
            const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)],
                                                  iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)],
                                                  iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)],
                                                  iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)],
                                                  iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)],
                                                  iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)],
                                                  iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)],
                                                  iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]);
            qs += 8;

            // Expand 32 sign bits to a per-byte 0x00/0xFF mask, then apply the
            // signs to the q8 side so the grid side stays unsigned for maddubs.
            __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);

            aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);

            signs += 4;

            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
            sumi1 = _mm256_add_epi32(sumi1, p1);
            sumi2 = _mm256_add_epi32(sumi2, p2);
        }

        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);

    }

    *s = 0.25f * hsum_float_8(accumf);

#else

    // Scalar reference path: same encoding, one quant at a time.
    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * restrict qs = x[i].qs;
        const uint8_t * restrict qh = x[i].qh;
        const uint8_t * restrict signs = x[i].signs;
        const int8_t * restrict q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = 0.25f * sumf;
#endif
}
|
10271
|
+
|
10272
|
+
|
9330
10273
|
#ifdef __AVX2__
|
9331
10274
|
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
9332
10275
|
const __m256i ax = _mm256_sign_epi8(x, x);
|
@@ -9348,7 +10291,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9348
10291
|
|
9349
10292
|
const int nb = n / QK_K;
|
9350
10293
|
|
9351
|
-
|
10294
|
+
// TODO: implement for QK_K = 64
|
10295
|
+
#if defined __ARM_NEON && QK_K == 256
|
9352
10296
|
|
9353
10297
|
const uint8x16_t m8 = vdupq_n_u8(0x08);
|
9354
10298
|
const uint8x16_t m7 = vdupq_n_u8(0x07);
|
@@ -9405,7 +10349,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9405
10349
|
|
9406
10350
|
*s = sumf;
|
9407
10351
|
|
9408
|
-
|
10352
|
+
// TODO: implement for QK_K = 64
|
10353
|
+
#elif defined __AVX2__ && QK_K == 256
|
9409
10354
|
|
9410
10355
|
const __m128i m8 = _mm_set1_epi8(0x08);
|
9411
10356
|
const __m128i m7 = _mm_set1_epi8(0x07);
|
@@ -9420,8 +10365,12 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9420
10365
|
|
9421
10366
|
uint64_t aux64;
|
9422
10367
|
|
9423
|
-
|
9424
|
-
|
10368
|
+
typedef union m256i_uint16 {
|
10369
|
+
__m256i reg;
|
10370
|
+
uint16_t s[16];
|
10371
|
+
} m256i_uint16_t;
|
10372
|
+
|
10373
|
+
m256i_uint16_t v_gindex;
|
9425
10374
|
|
9426
10375
|
__m256 accum = _mm256_setzero_ps();
|
9427
10376
|
for (int i = 0; i < nb; ++i) {
|
@@ -9436,13 +10385,13 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9436
10385
|
memcpy(&aux64, sc, 8); sc += 8;
|
9437
10386
|
const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
|
9438
10387
|
const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
|
9439
|
-
v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
10388
|
+
v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
9440
10389
|
const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
|
9441
10390
|
|
9442
10391
|
for (int i32 = 0; i32 < 4; ++i32) {
|
9443
10392
|
const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
9444
|
-
const __m256i q1b = _mm256_set_epi64x(iq1s_grid[
|
9445
|
-
iq1s_grid[
|
10393
|
+
const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
|
10394
|
+
iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
|
9446
10395
|
const __m256i dot = mul_add_epi8(q1b, q8b);
|
9447
10396
|
const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
|
9448
10397
|
const __m256i p = _mm256_madd_epi16(s16, dot);
|
@@ -9523,6 +10472,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
9523
10472
|
float sumf = 0;
|
9524
10473
|
|
9525
10474
|
for (int ib = 0; ib < nb; ib += 2) {
|
10475
|
+
|
9526
10476
|
q4bits.val[0] = vld1q_u8(x[ib+0].qs);
|
9527
10477
|
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
|
9528
10478
|
q8b.val[0] = vld1q_s8(y[ib+0].qs);
|
@@ -9592,6 +10542,138 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
9592
10542
|
#endif
|
9593
10543
|
}
|
9594
10544
|
|
10545
|
+
// Dot product of one row quantized as IQ4_XS with one row quantized as Q8_K.
// n must be a multiple of QK_K; the scalar result is written to *s.
// Only nrc == 1 is supported (asserted); bs/bx/by are unused.
//
// IQ4_XS stores 4-bit indices into the non-linear codebook kvalues_iq4nl,
// with 6-bit block scales split between scales_l (low 4 bits, two per byte)
// and scales_h (high 2 bits, consumed via h >>= 4 per pair of blocks),
// biased by -32.
void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);
#if QK_K == 64
    // With 64-value super-blocks the layout matches IQ4_NL vs Q8_0.
    ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
#else

    const block_iq4_xs * restrict x = vx;
    const block_q8_K * restrict y = vy;

    const int nb = n / QK_K;

#if defined __ARM_NEON
    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    ggml_uint8x16x2_t q4bits;
    ggml_int8x16x4_t q4b;
    ggml_int8x16x4_t q8b;
    int32x4_t prod_1, prod_2;

    float sumf = 0;

    for (int ibl = 0; ibl < nb; ++ibl) {

        const int8_t * q8 = y[ibl].qs;
        const uint8_t * q4 = x[ibl].qs;
        uint16_t h = x[ibl].scales_h;

        int sumi1 = 0, sumi2 = 0;
        // 64 quants per iteration: 32 packed nibble bytes, 64 q8 bytes.
        for (int ib = 0; ib < QK_K/64; ++ib) {

            q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
            q8b    = ggml_vld1q_s8_x4(q8); q8 += 64;

            // Nibble -> codebook value via table lookup.
            q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
            q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
            q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
            q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

            prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
            prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

            // Reassemble 6-bit scales (4 low bits + 2 high bits), bias -32.
            int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
            int ls2 = ((x[ibl].scales_l[ib] >>  4) | ((h << 2) & 0x30)) - 32;
            h >>= 4;
            sumi1 += vaddvq_s32(prod_1) * ls1;
            sumi2 += vaddvq_s32(prod_2) * ls2;

        }

        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
    }

    *s = sumf;

#elif defined __AVX2__

    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i m4b  = _mm_set1_epi8(0x0f);

    __m256 accum = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * qs = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;
        uint16_t sh = x[ibl].scales_h;
        __m256i sumi1 = _mm256_setzero_si256();
        __m256i sumi2 = _mm256_setzero_si256();
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            // pshufb as a 16-entry LUT: low nibbles in the low 128-bit half,
            // high nibbles in the high half.
            const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
            const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
            sh >>= 4;
            const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
            const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
            sumi1 = _mm256_add_epi32(p_1, sumi1);
            sumi2 = _mm256_add_epi32(p_2, sumi2);
        }
        accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
                _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
    }

    *s = hsum_float_8(accum);

#else
    // Scalar reference path.
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
#endif
}
|
10676
|
+
|
9595
10677
|
// ================================ IQ2 quantization =============================================
|
9596
10678
|
|
9597
10679
|
typedef struct {
|
@@ -9600,22 +10682,25 @@ typedef struct {
|
|
9600
10682
|
uint16_t * neighbours;
|
9601
10683
|
} iq2_entry_t;
|
9602
10684
|
|
9603
|
-
static iq2_entry_t iq2_data[
|
10685
|
+
// Lazily-populated grid/neighbour tables, one slot per IQ quant type.
// Slot order matches iq2_data_index(): IQ2_XXS, IQ2_XS, IQ1_S, IQ2_S.
static iq2_entry_t iq2_data[4] = {
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
};
|
9608
10691
|
|
9609
10692
|
static inline int iq2_data_index(enum ggml_type type) {
|
9610
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
10693
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
9611
10694
|
return type == GGML_TYPE_IQ2_XXS ? 0 :
|
9612
|
-
type == GGML_TYPE_IQ2_XS ? 1 :
|
10695
|
+
type == GGML_TYPE_IQ2_XS ? 1 :
|
10696
|
+
type == GGML_TYPE_IQ1_S ? 2 : 3;
|
9613
10697
|
}
|
9614
10698
|
|
9615
10699
|
static inline int iq2_grid_size(enum ggml_type type) {
|
9616
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
10700
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
9617
10701
|
return type == GGML_TYPE_IQ2_XXS ? 256 :
|
9618
|
-
type == GGML_TYPE_IQ2_XS ? 512 :
|
10702
|
+
type == GGML_TYPE_IQ2_XS ? 512 :
|
10703
|
+
type == GGML_TYPE_IQ1_S ? 512 : 1024;
|
9619
10704
|
}
|
9620
10705
|
|
9621
10706
|
static int iq2_compare_func(const void * left, const void * right) {
|
@@ -9716,11 +10801,79 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
9716
10801
|
41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
|
9717
10802
|
42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
|
9718
10803
|
};
|
10804
|
+
static const uint16_t kgrid_2bit_1024[1024] = {
|
10805
|
+
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
10806
|
+
73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
|
10807
|
+
165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
|
10808
|
+
337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
|
10809
|
+
517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
|
10810
|
+
674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
|
10811
|
+
1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
|
10812
|
+
1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
|
10813
|
+
1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
|
10814
|
+
1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
|
10815
|
+
2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
|
10816
|
+
2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
|
10817
|
+
2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
|
10818
|
+
4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
|
10819
|
+
4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
|
10820
|
+
4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
|
10821
|
+
4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
|
10822
|
+
4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
|
10823
|
+
5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
|
10824
|
+
5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
|
10825
|
+
5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
|
10826
|
+
5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
|
10827
|
+
6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
|
10828
|
+
6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
|
10829
|
+
8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
|
10830
|
+
8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
|
10831
|
+
8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
|
10832
|
+
9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
|
10833
|
+
9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
|
10834
|
+
10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
|
10835
|
+
16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
|
10836
|
+
16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
|
10837
|
+
16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
|
10838
|
+
16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
|
10839
|
+
17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
|
10840
|
+
17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
|
10841
|
+
17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
|
10842
|
+
17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
|
10843
|
+
18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
|
10844
|
+
18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
|
10845
|
+
18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
|
10846
|
+
20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
|
10847
|
+
20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
|
10848
|
+
20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
|
10849
|
+
21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
|
10850
|
+
21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
|
10851
|
+
22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
|
10852
|
+
22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
|
10853
|
+
24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
|
10854
|
+
24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
|
10855
|
+
25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
|
10856
|
+
26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
|
10857
|
+
32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
|
10858
|
+
33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
|
10859
|
+
33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
|
10860
|
+
33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
|
10861
|
+
34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
|
10862
|
+
35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
|
10863
|
+
36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
|
10864
|
+
37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
|
10865
|
+
38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
|
10866
|
+
39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
|
10867
|
+
41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
|
10868
|
+
42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
|
10869
|
+
};
|
9719
10870
|
|
9720
10871
|
const int kmap_size = 43692;
|
9721
|
-
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
10872
|
+
//const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
10873
|
+
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
|
9722
10874
|
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
|
9723
|
-
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
10875
|
+
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
10876
|
+
type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
|
9724
10877
|
uint64_t * kgrid_q2xs;
|
9725
10878
|
int * kmap_q2xs;
|
9726
10879
|
uint16_t * kneighbors_q2xs;
|
@@ -9817,7 +10970,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
9817
10970
|
}
|
9818
10971
|
|
9819
10972
|
void iq2xs_free_impl(enum ggml_type type) {
|
9820
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
10973
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
9821
10974
|
const int gindex = iq2_data_index(type);
|
9822
10975
|
if (iq2_data[gindex].grid) {
|
9823
10976
|
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
@@ -9866,7 +11019,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
9866
11019
|
|
9867
11020
|
const int kMaxQ = 3;
|
9868
11021
|
|
9869
|
-
const int nbl = n/
|
11022
|
+
const int nbl = n/QK_K;
|
9870
11023
|
|
9871
11024
|
block_iq2_xxs * y = vy;
|
9872
11025
|
|
@@ -10039,7 +11192,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
10039
11192
|
|
10040
11193
|
const int kMaxQ = 3;
|
10041
11194
|
|
10042
|
-
const int nbl = n/
|
11195
|
+
const int nbl = n/QK_K;
|
10043
11196
|
|
10044
11197
|
block_iq2_xs * y = vy;
|
10045
11198
|
|
@@ -10239,14 +11392,15 @@ typedef struct {
|
|
10239
11392
|
uint16_t * neighbours;
|
10240
11393
|
} iq3_entry_t;
|
10241
11394
|
|
10242
|
-
static iq3_entry_t iq3_data[
|
11395
|
+
static iq3_entry_t iq3_data[2] = {
|
11396
|
+
{NULL, NULL, NULL},
|
10243
11397
|
{NULL, NULL, NULL},
|
10244
11398
|
};
|
10245
11399
|
|
10246
11400
|
static inline int iq3_data_index(int grid_size) {
|
10247
11401
|
(void)grid_size;
|
10248
|
-
GGML_ASSERT(grid_size == 256);
|
10249
|
-
return 0;
|
11402
|
+
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
11403
|
+
return grid_size == 256 ? 0 : 1;
|
10250
11404
|
}
|
10251
11405
|
|
10252
11406
|
static int iq3_compare_func(const void * left, const void * right) {
|
@@ -10278,9 +11432,44 @@ void iq3xs_init_impl(int grid_size) {
|
|
10278
11432
|
3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
|
10279
11433
|
3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
|
10280
11434
|
};
|
11435
|
+
static const uint16_t kgrid_512[512] = {
|
11436
|
+
0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34,
|
11437
|
+
37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77,
|
11438
|
+
80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142,
|
11439
|
+
145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210,
|
11440
|
+
217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288,
|
11441
|
+
291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393,
|
11442
|
+
395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514,
|
11443
|
+
516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576,
|
11444
|
+
577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653,
|
11445
|
+
655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727,
|
11446
|
+
728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833,
|
11447
|
+
840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977,
|
11448
|
+
989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047,
|
11449
|
+
1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103,
|
11450
|
+
1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199,
|
11451
|
+
1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296,
|
11452
|
+
1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415,
|
11453
|
+
1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561,
|
11454
|
+
1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648,
|
11455
|
+
1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761,
|
11456
|
+
1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877,
|
11457
|
+
1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068,
|
11458
|
+
2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177,
|
11459
|
+
2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269,
|
11460
|
+
2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520,
|
11461
|
+
2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634,
|
11462
|
+
2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805,
|
11463
|
+
2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083,
|
11464
|
+
3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276,
|
11465
|
+
3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591,
|
11466
|
+
3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729,
|
11467
|
+
3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032,
|
11468
|
+
};
|
11469
|
+
|
10281
11470
|
const int kmap_size = 4096;
|
10282
|
-
const int nwant = 2;
|
10283
|
-
const uint16_t * kgrid = kgrid_256;
|
11471
|
+
const int nwant = grid_size == 256 ? 2 : 3;
|
11472
|
+
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
|
10284
11473
|
uint32_t * kgrid_q3xs;
|
10285
11474
|
int * kmap_q3xs;
|
10286
11475
|
uint16_t * kneighbors_q3xs;
|
@@ -10377,7 +11566,7 @@ void iq3xs_init_impl(int grid_size) {
|
|
10377
11566
|
}
|
10378
11567
|
|
10379
11568
|
void iq3xs_free_impl(int grid_size) {
|
10380
|
-
GGML_ASSERT(grid_size == 256);
|
11569
|
+
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
10381
11570
|
const int gindex = iq3_data_index(grid_size);
|
10382
11571
|
if (iq3_data[gindex].grid) {
|
10383
11572
|
free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
|
@@ -10410,9 +11599,10 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
10410
11599
|
return grid_index;
|
10411
11600
|
}
|
10412
11601
|
|
10413
|
-
static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n,
|
11602
|
+
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
|
11603
|
+
const float * restrict quant_weights) {
|
10414
11604
|
|
10415
|
-
const int gindex = iq3_data_index(
|
11605
|
+
const int gindex = iq3_data_index(grid_size);
|
10416
11606
|
|
10417
11607
|
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
10418
11608
|
const int * kmap_q3xs = iq3_data[gindex].map;
|
@@ -10426,9 +11616,23 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10426
11616
|
|
10427
11617
|
const int kMaxQ = 8;
|
10428
11618
|
|
10429
|
-
const int nbl = n/
|
11619
|
+
const int nbl = n/QK_K;
|
10430
11620
|
|
10431
|
-
|
11621
|
+
ggml_fp16_t * dh;
|
11622
|
+
uint8_t * qs;
|
11623
|
+
int block_size;
|
11624
|
+
if (grid_size == 256) {
|
11625
|
+
block_iq3_xxs * y = vy;
|
11626
|
+
dh = &y->d;
|
11627
|
+
qs = y->qs;
|
11628
|
+
block_size = sizeof(block_iq3_xxs);
|
11629
|
+
} else {
|
11630
|
+
block_iq3_s * y = vy;
|
11631
|
+
dh = &y->d;
|
11632
|
+
qs = y->qs;
|
11633
|
+
block_size = sizeof(block_iq3_s);
|
11634
|
+
}
|
11635
|
+
int quant_size = block_size - sizeof(ggml_fp16_t);
|
10432
11636
|
|
10433
11637
|
float scales[QK_K/32];
|
10434
11638
|
float weight[32];
|
@@ -10439,57 +11643,271 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10439
11643
|
bool is_on_grid[8];
|
10440
11644
|
bool is_on_grid_aux[8];
|
10441
11645
|
uint8_t block_signs[8];
|
10442
|
-
uint8_t q3[3*(QK_K/8)];
|
11646
|
+
uint8_t q3[3*(QK_K/8)+QK_K/32];
|
10443
11647
|
uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
|
11648
|
+
uint8_t * qh = q3 + 3*(QK_K/8);
|
10444
11649
|
|
10445
11650
|
for (int ibl = 0; ibl < nbl; ++ibl) {
|
10446
11651
|
|
11652
|
+
dh[0] = GGML_FP32_TO_FP16(0.f);
|
11653
|
+
memset(q3, 0, 3*QK_K/8+QK_K/32);
|
11654
|
+
|
11655
|
+
float max_scale = 0;
|
11656
|
+
|
11657
|
+
const float * xbl = x + QK_K*ibl;
|
11658
|
+
float sumx2 = 0;
|
11659
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
11660
|
+
float sigma2 = 2*sumx2/QK_K;
|
11661
|
+
|
11662
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
11663
|
+
const float * xb = xbl + 32*ib;
|
11664
|
+
if (quant_weights) {
|
11665
|
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
11666
|
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
11667
|
+
} else {
|
11668
|
+
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
11669
|
+
}
|
11670
|
+
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
11671
|
+
for (int k = 0; k < 4; ++k) {
|
11672
|
+
int nflip = 0;
|
11673
|
+
uint8_t s = 0;
|
11674
|
+
for (int i = 0; i < 8; ++i) {
|
11675
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
11676
|
+
else {
|
11677
|
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
11678
|
+
}
|
11679
|
+
}
|
11680
|
+
if (nflip%2) {
|
11681
|
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
11682
|
+
for (int i = 1; i < 8; ++i) {
|
11683
|
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
11684
|
+
if (ax < min) {
|
11685
|
+
min = ax; imin = i;
|
11686
|
+
}
|
11687
|
+
}
|
11688
|
+
xval[8*k+imin] = -xval[8*k+imin];
|
11689
|
+
s ^= (1 << imin);
|
11690
|
+
}
|
11691
|
+
block_signs[k] = s & 127;
|
11692
|
+
}
|
11693
|
+
float max = xval[0];
|
11694
|
+
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
11695
|
+
if (!max) {
|
11696
|
+
scales[ib] = 0;
|
11697
|
+
memset(L, 0, 32);
|
11698
|
+
continue;
|
11699
|
+
}
|
11700
|
+
float best = 0;
|
11701
|
+
float scale = max/(2*kMaxQ-1);
|
11702
|
+
for (int is = -15; is <= 15; ++is) {
|
11703
|
+
float id = (2*kMaxQ-1+is*0.2f)/max;
|
11704
|
+
float this_scale = 1/id;
|
11705
|
+
for (int k = 0; k < 8; ++k) {
|
11706
|
+
for (int i = 0; i < 4; ++i) {
|
11707
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
11708
|
+
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
11709
|
+
}
|
11710
|
+
uint16_t u = 0;
|
11711
|
+
for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
|
11712
|
+
int grid_index = kmap_q3xs[u];
|
11713
|
+
is_on_grid_aux[k] = true;
|
11714
|
+
if (grid_index < 0) {
|
11715
|
+
is_on_grid_aux[k] = false;
|
11716
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
11717
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
|
11718
|
+
}
|
11719
|
+
}
|
11720
|
+
float sumqx = 0, sumq2 = 0;
|
11721
|
+
for (int i = 0; i < 32; ++i) {
|
11722
|
+
float w = weight[i];
|
11723
|
+
float q = 2*Laux[i] + 1;
|
11724
|
+
sumqx += w*xval[i]*q;
|
11725
|
+
sumq2 += w*q*q;
|
11726
|
+
}
|
11727
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
11728
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
11729
|
+
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
|
11730
|
+
for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
11731
|
+
}
|
11732
|
+
}
|
11733
|
+
int n_not_ongrid = 0;
|
11734
|
+
for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
11735
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
11736
|
+
float id = 1/scale;
|
11737
|
+
for (int k = 0; k < 8; ++k) {
|
11738
|
+
if (is_on_grid[k]) continue;
|
11739
|
+
uint16_t u = 0;
|
11740
|
+
for (int i = 0; i < 4; ++i) {
|
11741
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
11742
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
11743
|
+
u |= (l << 3*i);
|
11744
|
+
}
|
11745
|
+
int grid_index = kmap_q3xs[u];
|
11746
|
+
if (grid_index < 0) {
|
11747
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
11748
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
|
11749
|
+
}
|
11750
|
+
const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
|
11751
|
+
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
11752
|
+
}
|
11753
|
+
float sumqx = 0, sumq2 = 0;
|
11754
|
+
for (int i = 0; i < 32; ++i) {
|
11755
|
+
float w = weight[i];
|
11756
|
+
float q = 2*L[i] + 1;
|
11757
|
+
sumqx += w*xval[i]*q;
|
11758
|
+
sumq2 += w*q*q;
|
11759
|
+
}
|
11760
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
11761
|
+
}
|
11762
|
+
if (scale < 0) {
|
11763
|
+
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
11764
|
+
// and correspondingly flip quant signs.
|
11765
|
+
scale = -scale;
|
11766
|
+
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
11767
|
+
}
|
11768
|
+
for (int k = 0; k < 8; ++k) {
|
11769
|
+
uint16_t u = 0;
|
11770
|
+
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
11771
|
+
int grid_index = kmap_q3xs[u];
|
11772
|
+
if (grid_index < 0) {
|
11773
|
+
printf("Oops: found point %u not on grid:", u);
|
11774
|
+
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
11775
|
+
printf("\n");
|
11776
|
+
GGML_ASSERT(false);
|
11777
|
+
}
|
11778
|
+
if (grid_size == 256) {
|
11779
|
+
q3[8*ib+k] = grid_index;
|
11780
|
+
} else {
|
11781
|
+
q3[8*ib+k] = grid_index & 255;
|
11782
|
+
qh[ib] |= ((grid_index >> 8) << k);
|
11783
|
+
}
|
11784
|
+
|
11785
|
+
}
|
11786
|
+
scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
|
11787
|
+
GGML_ASSERT(scale >= 0);
|
11788
|
+
scales[ib] = scale;
|
11789
|
+
max_scale = MAX(max_scale, scale);
|
11790
|
+
}
|
11791
|
+
|
11792
|
+
if (!max_scale) {
|
11793
|
+
memset(qs, 0, quant_size);
|
11794
|
+
dh += block_size/sizeof(ggml_fp16_t);
|
11795
|
+
qs += block_size;
|
11796
|
+
continue;
|
11797
|
+
}
|
11798
|
+
|
11799
|
+
float d = max_scale/31;
|
11800
|
+
dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor
|
11801
|
+
float id = 1/d;
|
11802
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
11803
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
11804
|
+
l = MAX(0, MIN(15, l));
|
11805
|
+
scales_and_signs[ib] |= ((uint32_t)l << 28);
|
11806
|
+
}
|
11807
|
+
memcpy(qs, q3, quant_size);
|
11808
|
+
|
11809
|
+
dh += block_size/sizeof(ggml_fp16_t);
|
11810
|
+
qs += block_size;
|
11811
|
+
|
11812
|
+
}
|
11813
|
+
}
|
11814
|
+
|
11815
|
+
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
11816
|
+
(void)hist;
|
11817
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
11818
|
+
int nblock = n_per_row/QK_K;
|
11819
|
+
char * qrow = (char *)dst;
|
11820
|
+
for (int row = 0; row < nrow; ++row) {
|
11821
|
+
quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
|
11822
|
+
src += n_per_row;
|
11823
|
+
qrow += nblock*sizeof(block_iq3_xxs);
|
11824
|
+
}
|
11825
|
+
return nrow * nblock * sizeof(block_iq3_xxs);
|
11826
|
+
}
|
11827
|
+
|
11828
|
+
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
|
11829
|
+
assert(k % QK_K == 0);
|
11830
|
+
block_iq3_xxs * restrict y = vy;
|
11831
|
+
quantize_row_iq3_xxs_reference(x, y, k);
|
11832
|
+
}
|
11833
|
+
|
11834
|
+
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
|
11835
|
+
assert(k % QK_K == 0);
|
11836
|
+
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
11837
|
+
}
|
11838
|
+
|
11839
|
+
static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
|
11840
|
+
const float * restrict quant_weights,
|
11841
|
+
float * scales,
|
11842
|
+
float * weight,
|
11843
|
+
float * xval,
|
11844
|
+
int8_t * L,
|
11845
|
+
int8_t * Laux,
|
11846
|
+
float * waux,
|
11847
|
+
bool * is_on_grid,
|
11848
|
+
bool * is_on_grid_aux,
|
11849
|
+
uint8_t * block_signs) {
|
11850
|
+
|
11851
|
+
const int gindex = iq3_data_index(512);
|
11852
|
+
|
11853
|
+
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
11854
|
+
const int * kmap_q3xs = iq3_data[gindex].map;
|
11855
|
+
const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
|
11856
|
+
|
11857
|
+
//GGML_ASSERT(quant_weights && "missing quantization weights");
|
11858
|
+
GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
|
11859
|
+
GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
|
11860
|
+
GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
|
11861
|
+
GGML_ASSERT(n%QK_K == 0);
|
11862
|
+
|
11863
|
+
const int kMaxQ = 8;
|
11864
|
+
|
11865
|
+
const int nbl = n/QK_K;
|
11866
|
+
|
11867
|
+
block_iq3_s * y = vy;
|
11868
|
+
|
11869
|
+
const int bs4 = block_size/4;
|
11870
|
+
const int bs8 = block_size/8;
|
11871
|
+
|
11872
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
11873
|
+
|
11874
|
+
memset(&y[ibl], 0, sizeof(block_iq3_s));
|
10447
11875
|
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
10448
|
-
|
11876
|
+
|
11877
|
+
uint8_t * qs = y[ibl].qs;
|
11878
|
+
uint8_t * qh = y[ibl].qh;
|
11879
|
+
uint8_t * signs = y[ibl].signs;
|
10449
11880
|
|
10450
11881
|
float max_scale = 0;
|
10451
11882
|
|
10452
11883
|
const float * xbl = x + QK_K*ibl;
|
10453
11884
|
float sumx2 = 0;
|
10454
11885
|
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
10455
|
-
float sigma2 = sumx2/QK_K;
|
11886
|
+
float sigma2 = 2*sumx2/QK_K;
|
10456
11887
|
|
10457
|
-
for (int ib = 0; ib < QK_K/
|
10458
|
-
const float * xb = xbl +
|
11888
|
+
for (int ib = 0; ib < QK_K/block_size; ++ib) {
|
11889
|
+
const float * xb = xbl + block_size*ib;
|
10459
11890
|
if (quant_weights) {
|
10460
|
-
const float * qw = quant_weights + QK_K*ibl +
|
10461
|
-
for (int i = 0; i <
|
11891
|
+
const float * qw = quant_weights + QK_K*ibl + block_size*ib;
|
11892
|
+
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
10462
11893
|
} else {
|
10463
|
-
for (int i = 0; i <
|
11894
|
+
for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
|
10464
11895
|
}
|
10465
|
-
for (int i = 0; i <
|
10466
|
-
for (int k = 0; k <
|
10467
|
-
int nflip = 0;
|
11896
|
+
for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
|
11897
|
+
for (int k = 0; k < bs8; ++k) {
|
10468
11898
|
uint8_t s = 0;
|
10469
11899
|
for (int i = 0; i < 8; ++i) {
|
10470
11900
|
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
10471
11901
|
else {
|
10472
|
-
xval[8*k + i] = -xb[8*k + i];
|
10473
|
-
}
|
10474
|
-
}
|
10475
|
-
if (nflip%2) {
|
10476
|
-
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
10477
|
-
for (int i = 1; i < 8; ++i) {
|
10478
|
-
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
10479
|
-
if (ax < min) {
|
10480
|
-
min = ax; imin = i;
|
10481
|
-
}
|
11902
|
+
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
10482
11903
|
}
|
10483
|
-
xval[8*k+imin] = -xval[8*k+imin];
|
10484
|
-
s ^= (1 << imin);
|
10485
11904
|
}
|
10486
|
-
block_signs[k] = s
|
11905
|
+
block_signs[k] = s;
|
10487
11906
|
}
|
10488
11907
|
float max = xval[0];
|
10489
|
-
for (int i = 1; i <
|
11908
|
+
for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
|
10490
11909
|
if (!max) {
|
10491
11910
|
scales[ib] = 0;
|
10492
|
-
memset(L, 0, 32);
|
10493
11911
|
continue;
|
10494
11912
|
}
|
10495
11913
|
float best = 0;
|
@@ -10497,7 +11915,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10497
11915
|
for (int is = -15; is <= 15; ++is) {
|
10498
11916
|
float id = (2*kMaxQ-1+is*0.2f)/max;
|
10499
11917
|
float this_scale = 1/id;
|
10500
|
-
for (int k = 0; k <
|
11918
|
+
for (int k = 0; k < bs4; ++k) {
|
10501
11919
|
for (int i = 0; i < 4; ++i) {
|
10502
11920
|
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
10503
11921
|
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
@@ -10513,7 +11931,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10513
11931
|
}
|
10514
11932
|
}
|
10515
11933
|
float sumqx = 0, sumq2 = 0;
|
10516
|
-
for (int i = 0; i <
|
11934
|
+
for (int i = 0; i < block_size; ++i) {
|
10517
11935
|
float w = weight[i];
|
10518
11936
|
float q = 2*Laux[i] + 1;
|
10519
11937
|
sumqx += w*xval[i]*q;
|
@@ -10521,15 +11939,15 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10521
11939
|
}
|
10522
11940
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
10523
11941
|
scale = sumqx/sumq2; best = scale*sumqx;
|
10524
|
-
for (int i = 0; i <
|
10525
|
-
for (int k = 0; k <
|
11942
|
+
for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
|
11943
|
+
for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
10526
11944
|
}
|
10527
11945
|
}
|
10528
11946
|
int n_not_ongrid = 0;
|
10529
|
-
for (int k = 0; k <
|
11947
|
+
for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
10530
11948
|
if (n_not_ongrid > 0 && scale > 0) {
|
10531
11949
|
float id = 1/scale;
|
10532
|
-
for (int k = 0; k <
|
11950
|
+
for (int k = 0; k < bs4; ++k) {
|
10533
11951
|
if (is_on_grid[k]) continue;
|
10534
11952
|
uint16_t u = 0;
|
10535
11953
|
for (int i = 0; i < 4; ++i) {
|
@@ -10546,7 +11964,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10546
11964
|
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
10547
11965
|
}
|
10548
11966
|
float sumqx = 0, sumq2 = 0;
|
10549
|
-
for (int i = 0; i <
|
11967
|
+
for (int i = 0; i < block_size; ++i) {
|
10550
11968
|
float w = weight[i];
|
10551
11969
|
float q = 2*L[i] + 1;
|
10552
11970
|
sumqx += w*xval[i]*q;
|
@@ -10558,9 +11976,9 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10558
11976
|
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
10559
11977
|
// and correspondingly flip quant signs.
|
10560
11978
|
scale = -scale;
|
10561
|
-
for (int k = 0; k <
|
11979
|
+
for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
|
10562
11980
|
}
|
10563
|
-
for (int k = 0; k <
|
11981
|
+
for (int k = 0; k < bs4; ++k) {
|
10564
11982
|
uint16_t u = 0;
|
10565
11983
|
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
10566
11984
|
int grid_index = kmap_q3xs[u];
|
@@ -10570,99 +11988,71 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10570
11988
|
printf("\n");
|
10571
11989
|
GGML_ASSERT(false);
|
10572
11990
|
}
|
10573
|
-
|
11991
|
+
qs[k] = grid_index & 255;
|
11992
|
+
qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
|
10574
11993
|
}
|
10575
|
-
|
11994
|
+
qs += bs4;
|
11995
|
+
for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
|
11996
|
+
signs += bs8;
|
10576
11997
|
GGML_ASSERT(scale >= 0);
|
10577
11998
|
scales[ib] = scale;
|
10578
11999
|
max_scale = MAX(max_scale, scale);
|
10579
12000
|
}
|
10580
12001
|
|
10581
12002
|
if (!max_scale) {
|
10582
|
-
memset(y[ibl].qs, 0, 3*QK_K/8);
|
10583
12003
|
continue;
|
10584
12004
|
}
|
10585
12005
|
|
10586
12006
|
float d = max_scale/31;
|
10587
12007
|
y[ibl].d = GGML_FP32_TO_FP16(d);
|
10588
12008
|
float id = 1/d;
|
10589
|
-
|
10590
|
-
|
10591
|
-
|
10592
|
-
|
10593
|
-
|
10594
|
-
|
10595
|
-
const float * xb = xbl + 32*ib;
|
10596
|
-
if (quant_weights) {
|
10597
|
-
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
10598
|
-
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
10599
|
-
} else {
|
10600
|
-
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
10601
|
-
}
|
10602
|
-
const float db = 0.25f * d * (1 + 2*l);
|
10603
|
-
for (int k = 0; k < 8; ++k) {
|
10604
|
-
const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
|
10605
|
-
const float * xk = xb + 4*k;
|
10606
|
-
const float * wk = weight + 4*k;
|
10607
|
-
//const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
|
10608
|
-
const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
|
10609
|
-
float best_mse = 0; int best_index = q3[8*ib+k];
|
10610
|
-
for (int j = 0; j < 4; ++j) {
|
10611
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
10612
|
-
best_mse += wk[j] * diff * diff;
|
10613
|
-
}
|
10614
|
-
for (int idx = 0; idx < 256; ++idx) {
|
10615
|
-
//grid = (const uint8_t *)(kgrid_q3xs + idx);
|
10616
|
-
grid = (const uint8_t *)(iq3xxs_grid + idx);
|
10617
|
-
float mse = 0;
|
10618
|
-
for (int j = 0; j < 4; ++j) {
|
10619
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
10620
|
-
mse += wk[j] * diff * diff;
|
10621
|
-
}
|
10622
|
-
if (mse < best_mse) {
|
10623
|
-
best_mse = mse; best_index = idx;
|
10624
|
-
}
|
10625
|
-
}
|
10626
|
-
q3[8*ib+k] = best_index;
|
10627
|
-
//grid = (const uint8_t *)(kgrid_q3xs + best_index);
|
10628
|
-
grid = (const uint8_t *)(iq3xxs_grid + best_index);
|
10629
|
-
for (int j = 0; j < 4; ++j) {
|
10630
|
-
float q = db * grid[j] * signs[j];
|
10631
|
-
sumqx += wk[j] * q * xk[j];
|
10632
|
-
sumq2 += wk[j] * q * q;
|
10633
|
-
}
|
10634
|
-
}
|
10635
|
-
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
10636
|
-
}
|
12009
|
+
for (int ib = 0; ib < QK_K/block_size; ib += 2) {
|
12010
|
+
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
|
12011
|
+
l1 = MAX(0, MIN(15, l1));
|
12012
|
+
int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
|
12013
|
+
l2 = MAX(0, MIN(15, l2));
|
12014
|
+
y[ibl].scales[ib/2] = l1 | (l2 << 4);
|
10637
12015
|
}
|
10638
|
-
|
12016
|
+
|
10639
12017
|
}
|
10640
12018
|
}
|
10641
12019
|
|
10642
|
-
|
12020
|
+
#define IQ3S_BLOCK_SIZE 32
|
12021
|
+
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
10643
12022
|
(void)hist;
|
10644
12023
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
10645
12024
|
int nblock = n_per_row/QK_K;
|
12025
|
+
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
12026
|
+
float weight[IQ3S_BLOCK_SIZE];
|
12027
|
+
float xval[IQ3S_BLOCK_SIZE];
|
12028
|
+
int8_t L[IQ3S_BLOCK_SIZE];
|
12029
|
+
int8_t Laux[IQ3S_BLOCK_SIZE];
|
12030
|
+
float waux[IQ3S_BLOCK_SIZE];
|
12031
|
+
bool is_on_grid[IQ3S_BLOCK_SIZE/4];
|
12032
|
+
bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
|
12033
|
+
uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
|
10646
12034
|
char * qrow = (char *)dst;
|
10647
12035
|
for (int row = 0; row < nrow; ++row) {
|
10648
|
-
|
12036
|
+
quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
|
12037
|
+
scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
|
10649
12038
|
src += n_per_row;
|
10650
|
-
qrow += nblock*sizeof(
|
12039
|
+
qrow += nblock*sizeof(block_iq3_s);
|
10651
12040
|
}
|
10652
|
-
return nrow * nblock * sizeof(
|
12041
|
+
return nrow * nblock * sizeof(block_iq3_s);
|
10653
12042
|
}
|
10654
12043
|
|
10655
|
-
void
|
12044
|
+
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
10656
12045
|
assert(k % QK_K == 0);
|
10657
|
-
|
10658
|
-
|
12046
|
+
block_iq3_s * restrict y = vy;
|
12047
|
+
quantize_row_iq3_s_reference(x, y, k);
|
10659
12048
|
}
|
10660
12049
|
|
10661
|
-
void
|
12050
|
+
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
10662
12051
|
assert(k % QK_K == 0);
|
10663
|
-
|
12052
|
+
quantize_iq3_s(x, y, 1, k, NULL, NULL);
|
10664
12053
|
}
|
10665
12054
|
|
12055
|
+
|
10666
12056
|
// =================================== 1.5 bpw ===================================================
|
10667
12057
|
|
10668
12058
|
static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
@@ -10745,7 +12135,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
10745
12135
|
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
10746
12136
|
GGML_ASSERT(n%QK_K == 0);
|
10747
12137
|
|
10748
|
-
const int nbl = n/
|
12138
|
+
const int nbl = n/QK_K;
|
10749
12139
|
|
10750
12140
|
block_iq1_s * y = vy;
|
10751
12141
|
|
@@ -10880,23 +12270,23 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|
10880
12270
|
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
10881
12271
|
}
|
10882
12272
|
|
10883
|
-
static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x,
|
10884
|
-
ggml_fp16_t * dh, uint8_t * q4,
|
10885
|
-
float * weight, uint8_t * L,
|
12273
|
+
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
|
12274
|
+
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
12275
|
+
float * scales, float * weight, uint8_t * L,
|
10886
12276
|
const int8_t * values,
|
10887
12277
|
const float * quant_weights) {
|
10888
12278
|
|
10889
12279
|
const int ntry = 7;
|
10890
12280
|
|
10891
12281
|
float sigma2 = 0;
|
10892
|
-
for (int j = 0; j <
|
10893
|
-
sigma2 *= 2.f/
|
12282
|
+
for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
|
12283
|
+
sigma2 *= 2.f/super_block_size;
|
10894
12284
|
|
10895
|
-
|
12285
|
+
memset(q4, 0, super_block_size/2);
|
12286
|
+
dh[0] = GGML_FP32_TO_FP16(0.f);
|
10896
12287
|
|
10897
|
-
|
10898
|
-
for (int ib = 0; ib <
|
10899
|
-
dh[ib] = GGML_FP32_TO_FP16(0.f);
|
12288
|
+
float max_scale = 0, amax_scale = 0;
|
12289
|
+
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
10900
12290
|
const float * xb = x + ib*block_size;
|
10901
12291
|
if (quant_weights) {
|
10902
12292
|
const float * qw = quant_weights + ib*block_size;
|
@@ -10912,6 +12302,7 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
|
|
10912
12302
|
}
|
10913
12303
|
}
|
10914
12304
|
if (!amax) {
|
12305
|
+
scales[ib] = 0;
|
10915
12306
|
continue;
|
10916
12307
|
}
|
10917
12308
|
float d = -max/values[0];
|
@@ -10925,7 +12316,6 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
|
|
10925
12316
|
sumqx += w*q*xb[j];
|
10926
12317
|
sumq2 += w*q*q;
|
10927
12318
|
}
|
10928
|
-
float best_id = id;
|
10929
12319
|
d = sumqx/sumq2;
|
10930
12320
|
float best = d*sumqx;
|
10931
12321
|
for (int itry = -ntry; itry <= ntry; ++itry) {
|
@@ -10941,15 +12331,47 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
|
|
10941
12331
|
}
|
10942
12332
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
10943
12333
|
d = sumqx/sumq2; best = d * sumqx;
|
10944
|
-
best_id = id;
|
10945
12334
|
}
|
10946
12335
|
}
|
10947
|
-
|
10948
|
-
|
10949
|
-
|
12336
|
+
scales[ib] = d;
|
12337
|
+
float abs_d = fabsf(d);
|
12338
|
+
if (abs_d > amax_scale) {
|
12339
|
+
amax_scale = abs_d; max_scale = d;
|
12340
|
+
}
|
12341
|
+
}
|
12342
|
+
|
12343
|
+
if (super_block_size/block_size > 1) {
|
12344
|
+
int nb = super_block_size/block_size;
|
12345
|
+
memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
|
12346
|
+
float d = -max_scale/32;
|
12347
|
+
dh[0] = GGML_FP32_TO_FP16(d);
|
12348
|
+
float id = d ? 1/d : 0.f;
|
12349
|
+
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
12350
|
+
int l = nearest_int(id*scales[ib]);
|
12351
|
+
l = MAX(-32, MIN(31, l));
|
12352
|
+
float dl = d * l;
|
12353
|
+
float idl = dl ? 1/dl : 0.f;
|
12354
|
+
uint8_t * Lb = L + ib*block_size;
|
12355
|
+
const float * xb = x + ib*block_size;
|
12356
|
+
for (int j = 0; j < block_size; ++j) {
|
12357
|
+
Lb[j] = best_index_int8(16, values, idl*xb[j]);
|
12358
|
+
}
|
12359
|
+
l += 32;
|
12360
|
+
uint8_t l_l = l & 0xf;
|
12361
|
+
uint8_t l_h = l >> 4;
|
12362
|
+
if (ib%2 == 0) scales_l[ib/2] = l_l;
|
12363
|
+
else scales_l[ib/2] |= (l_l << 4);
|
12364
|
+
scales_h[ib/8] |= (l_h << 2*(ib%8));
|
12365
|
+
}
|
12366
|
+
} else {
|
12367
|
+
dh[0] = GGML_FP32_TO_FP16(scales[0]);
|
12368
|
+
float id = scales[0] ? 1/scales[0] : 0;
|
12369
|
+
for (int j = 0; j < super_block_size; ++j) {
|
12370
|
+
L[j] = best_index_int8(16, values, id*x[j]);
|
10950
12371
|
}
|
10951
12372
|
}
|
10952
|
-
|
12373
|
+
|
12374
|
+
for (int i = 0; i < super_block_size/32; ++i) {
|
10953
12375
|
for (int j = 0; j < 16; ++j) {
|
10954
12376
|
q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
|
10955
12377
|
}
|
@@ -10962,12 +12384,16 @@ size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, i
|
|
10962
12384
|
int nblock = n_per_row/QK4_NL;
|
10963
12385
|
char * qrow = (char *)dst;
|
10964
12386
|
uint8_t L[QK4_NL];
|
10965
|
-
float weight[
|
12387
|
+
float weight[QK4_NL];
|
12388
|
+
uint16_t unused_h;
|
12389
|
+
uint8_t * unused_l = NULL;
|
12390
|
+
float scale;
|
10966
12391
|
for (int row = 0; row < nrow; ++row) {
|
10967
12392
|
block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
|
10968
12393
|
for (int ibl = 0; ibl < nblock; ++ibl) {
|
10969
12394
|
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
|
10970
|
-
quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs,
|
12395
|
+
quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
|
12396
|
+
&scale, weight, L, kvalues_iq4nl, qw);
|
10971
12397
|
}
|
10972
12398
|
src += n_per_row;
|
10973
12399
|
qrow += nblock*sizeof(block_iq4_nl);
|
@@ -10986,3 +12412,232 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
|
|
10986
12412
|
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
10987
12413
|
}
|
10988
12414
|
|
12415
|
+
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
12416
|
+
#if QK_K == 64
|
12417
|
+
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
12418
|
+
#else
|
12419
|
+
(void)hist;
|
12420
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
12421
|
+
int nblock = n_per_row/QK_K;
|
12422
|
+
char * qrow = (char *)dst;
|
12423
|
+
uint8_t L[QK_K];
|
12424
|
+
float weight[32];
|
12425
|
+
float scales[QK_K/32];
|
12426
|
+
for (int row = 0; row < nrow; ++row) {
|
12427
|
+
block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
|
12428
|
+
for (int ibl = 0; ibl < nblock; ++ibl) {
|
12429
|
+
const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
|
12430
|
+
quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
|
12431
|
+
scales, weight, L, kvalues_iq4nl, qw);
|
12432
|
+
}
|
12433
|
+
src += n_per_row;
|
12434
|
+
qrow += nblock*sizeof(block_iq4_xs);
|
12435
|
+
}
|
12436
|
+
return nrow * nblock * sizeof(block_iq4_xs);
|
12437
|
+
#endif
|
12438
|
+
}
|
12439
|
+
|
12440
|
+
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
12441
|
+
assert(k % QK_K == 0);
|
12442
|
+
block_iq4_xs * restrict y = vy;
|
12443
|
+
quantize_row_iq4_xs_reference(x, y, k);
|
12444
|
+
}
|
12445
|
+
|
12446
|
+
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
|
12447
|
+
assert(k % QK_K == 0);
|
12448
|
+
quantize_iq4_xs(x, y, 1, k, NULL, NULL);
|
12449
|
+
}
|
12450
|
+
|
12451
|
+
// =============================== 2.5625 bpw
|
12452
|
+
|
12453
|
+
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
12454
|
+
|
12455
|
+
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
12456
|
+
|
12457
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
12458
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
12459
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
12460
|
+
|
12461
|
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
12462
|
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
12463
|
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
12464
|
+
GGML_ASSERT(n%QK_K == 0);
|
12465
|
+
|
12466
|
+
const int kMaxQ = 3;
|
12467
|
+
|
12468
|
+
const int nbl = n/QK_K;
|
12469
|
+
|
12470
|
+
block_iq2_s * y = vy;
|
12471
|
+
|
12472
|
+
float scales[QK_K/16];
|
12473
|
+
float weight[16];
|
12474
|
+
float xval[16];
|
12475
|
+
int8_t L[16];
|
12476
|
+
int8_t Laux[16];
|
12477
|
+
float waux[16];
|
12478
|
+
bool is_on_grid[2];
|
12479
|
+
bool is_on_grid_aux[2];
|
12480
|
+
uint8_t block_signs[2];
|
12481
|
+
|
12482
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
12483
|
+
|
12484
|
+
memset(&y[ibl], 0, sizeof(block_iq2_s));
|
12485
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
12486
|
+
|
12487
|
+
float max_scale = 0;
|
12488
|
+
|
12489
|
+
const float * xbl = x + QK_K*ibl;
|
12490
|
+
float sumx2 = 0;
|
12491
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
12492
|
+
float sigma2 = 2*sumx2/QK_K;
|
12493
|
+
|
12494
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
12495
|
+
const float * xb = xbl + 16*ib;
|
12496
|
+
if (quant_weights) {
|
12497
|
+
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
12498
|
+
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
12499
|
+
} else {
|
12500
|
+
for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
|
12501
|
+
}
|
12502
|
+
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
12503
|
+
for (int k = 0; k < 2; ++k) {
|
12504
|
+
uint8_t s = 0;
|
12505
|
+
for (int i = 0; i < 8; ++i) {
|
12506
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
12507
|
+
else {
|
12508
|
+
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
12509
|
+
}
|
12510
|
+
}
|
12511
|
+
block_signs[k] = s;
|
12512
|
+
}
|
12513
|
+
float max = xval[0];
|
12514
|
+
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
12515
|
+
if (!max) {
|
12516
|
+
scales[ib] = 0;
|
12517
|
+
continue;
|
12518
|
+
}
|
12519
|
+
float best = 0;
|
12520
|
+
float scale = max/(2*kMaxQ-1);
|
12521
|
+
is_on_grid[0] = is_on_grid[1] = true;
|
12522
|
+
for (int is = -9; is <= 9; ++is) {
|
12523
|
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
12524
|
+
float this_scale = 1/id;
|
12525
|
+
for (int k = 0; k < 2; ++k) {
|
12526
|
+
for (int i = 0; i < 8; ++i) {
|
12527
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
12528
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
12529
|
+
}
|
12530
|
+
uint16_t u = 0;
|
12531
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
12532
|
+
int grid_index = kmap_q2xs[u];
|
12533
|
+
is_on_grid_aux[k] = true;
|
12534
|
+
if (grid_index < 0) {
|
12535
|
+
is_on_grid_aux[k] = false;
|
12536
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
12537
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
12538
|
+
}
|
12539
|
+
}
|
12540
|
+
float sumqx = 0, sumq2 = 0;
|
12541
|
+
for (int i = 0; i < 16; ++i) {
|
12542
|
+
float w = weight[i];
|
12543
|
+
float q = 2*Laux[i] + 1;
|
12544
|
+
sumqx += w*xval[i]*q;
|
12545
|
+
sumq2 += w*q*q;
|
12546
|
+
}
|
12547
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
12548
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
12549
|
+
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
12550
|
+
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
12551
|
+
}
|
12552
|
+
}
|
12553
|
+
int n_not_ongrid = 0;
|
12554
|
+
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
12555
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
12556
|
+
float id = 1/scale;
|
12557
|
+
for (int k = 0; k < 2; ++k) {
|
12558
|
+
if (is_on_grid[k]) continue;
|
12559
|
+
uint16_t u = 0;
|
12560
|
+
for (int i = 0; i < 8; ++i) {
|
12561
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
12562
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
12563
|
+
u |= (l << 2*i);
|
12564
|
+
L[8*k + i] = l;
|
12565
|
+
}
|
12566
|
+
int grid_index = kmap_q2xs[u];
|
12567
|
+
if (grid_index < 0) {
|
12568
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
12569
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
12570
|
+
}
|
12571
|
+
}
|
12572
|
+
float sumqx = 0, sumq2 = 0;
|
12573
|
+
for (int i = 0; i < 16; ++i) {
|
12574
|
+
float w = weight[i];
|
12575
|
+
float q = 2*L[i] + 1;
|
12576
|
+
sumqx += w*xval[i]*q;
|
12577
|
+
sumq2 += w*q*q;
|
12578
|
+
}
|
12579
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
12580
|
+
}
|
12581
|
+
if (scale < 0) {
|
12582
|
+
scale = -scale;
|
12583
|
+
for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
|
12584
|
+
}
|
12585
|
+
for (int k = 0; k < 2; ++k) {
|
12586
|
+
uint16_t u = 0;
|
12587
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
12588
|
+
int grid_index = kmap_q2xs[u];
|
12589
|
+
if (grid_index < 0) {
|
12590
|
+
printf("Oops: found point %u not on grid:", u);
|
12591
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
12592
|
+
printf("\n");
|
12593
|
+
GGML_ASSERT(false);
|
12594
|
+
}
|
12595
|
+
const int i8 = 2*ib + k;
|
12596
|
+
y[ibl].qs[i8] = grid_index & 255;
|
12597
|
+
y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
|
12598
|
+
y[ibl].qs[QK_K/8 + i8] = block_signs[k];
|
12599
|
+
}
|
12600
|
+
GGML_ASSERT(scale >= 0);
|
12601
|
+
scales[ib] = scale;
|
12602
|
+
max_scale = MAX(max_scale, scale);
|
12603
|
+
}
|
12604
|
+
|
12605
|
+
if (!max_scale) {
|
12606
|
+
continue;
|
12607
|
+
}
|
12608
|
+
|
12609
|
+
float d = max_scale/31;
|
12610
|
+
y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
|
12611
|
+
float id = 1/d;
|
12612
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
12613
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
12614
|
+
l = MAX(0, MIN(15, l));
|
12615
|
+
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
12616
|
+
else y[ibl].scales[ib/2] |= (l << 4);
|
12617
|
+
}
|
12618
|
+
}
|
12619
|
+
}
|
12620
|
+
|
12621
|
+
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
12622
|
+
(void)hist;
|
12623
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
12624
|
+
int nblock = n_per_row/QK_K;
|
12625
|
+
char * qrow = (char *)dst;
|
12626
|
+
for (int row = 0; row < nrow; ++row) {
|
12627
|
+
quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
|
12628
|
+
src += n_per_row;
|
12629
|
+
qrow += nblock*sizeof(block_iq2_s);
|
12630
|
+
}
|
12631
|
+
return nrow * nblock * sizeof(block_iq2_s);
|
12632
|
+
}
|
12633
|
+
|
12634
|
+
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
|
12635
|
+
assert(k % QK_K == 0);
|
12636
|
+
quantize_iq2_s(x, y, 1, k, NULL, NULL);
|
12637
|
+
}
|
12638
|
+
|
12639
|
+
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|
12640
|
+
assert(k % QK_K == 0);
|
12641
|
+
block_iq2_s * restrict y = vy;
|
12642
|
+
quantize_row_iq2_s_reference(x, y, k);
|
12643
|
+
}
|