llama_cpp 0.12.7 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
@@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|
462
462
|
return res;
|
463
463
|
}
|
464
464
|
|
465
|
+
// NOTE: not tested
|
466
|
+
inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
467
|
+
int8x16_t res;
|
468
|
+
|
469
|
+
res[ 0] = a[b[ 0]];
|
470
|
+
res[ 1] = a[b[ 1]];
|
471
|
+
res[ 2] = a[b[ 2]];
|
472
|
+
res[ 3] = a[b[ 3]];
|
473
|
+
res[ 4] = a[b[ 4]];
|
474
|
+
res[ 5] = a[b[ 5]];
|
475
|
+
res[ 6] = a[b[ 6]];
|
476
|
+
res[ 7] = a[b[ 7]];
|
477
|
+
res[ 8] = a[b[ 8]];
|
478
|
+
res[ 9] = a[b[ 9]];
|
479
|
+
res[10] = a[b[10]];
|
480
|
+
res[11] = a[b[11]];
|
481
|
+
res[12] = a[b[12]];
|
482
|
+
res[13] = a[b[13]];
|
483
|
+
res[14] = a[b[14]];
|
484
|
+
res[15] = a[b[15]];
|
485
|
+
|
486
|
+
return res;
|
487
|
+
}
|
488
|
+
|
465
489
|
#else
|
466
490
|
|
467
491
|
#define ggml_int16x8x2_t int16x8x2_t
|
@@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|
476
500
|
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
477
501
|
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
478
502
|
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
503
|
+
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
479
504
|
|
480
505
|
#endif
|
481
506
|
|
@@ -1852,7 +1877,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
1852
1877
|
float mins[QK_K/16];
|
1853
1878
|
float scales[QK_K/16];
|
1854
1879
|
float sw[QK_K/16];
|
1855
|
-
float weight[
|
1880
|
+
float weight[16];
|
1856
1881
|
uint8_t Ls[QK_K/16], Lm[QK_K/16];
|
1857
1882
|
|
1858
1883
|
for (int i = 0; i < nb; i++) {
|
@@ -1862,13 +1887,42 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
1862
1887
|
float sigma2 = sumx2/QK_K;
|
1863
1888
|
for (int j = 0; j < QK_K/16; ++j) {
|
1864
1889
|
const float * restrict qw = quant_weights + QK_K * i + 16*j;
|
1865
|
-
for (int l = 0; l <
|
1890
|
+
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
1866
1891
|
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
|
1867
|
-
scales[j] = make_qkx3_quants(
|
1892
|
+
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
1868
1893
|
}
|
1869
1894
|
|
1870
|
-
float dm
|
1871
|
-
|
1895
|
+
float dm, mm;
|
1896
|
+
#if QK_K == 64
|
1897
|
+
float max_scale = 0, max_min = 0;
|
1898
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
1899
|
+
max_scale = MAX(max_scale, scales[j]);
|
1900
|
+
max_min = MAX(max_min, mins[j]);
|
1901
|
+
}
|
1902
|
+
dm = max_scale/15;
|
1903
|
+
mm = max_min/15;
|
1904
|
+
if (max_scale) {
|
1905
|
+
float id = 1/dm;
|
1906
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
1907
|
+
int l = nearest_int(id*scales[j]);
|
1908
|
+
Ls[j] = MAX(0, MIN(15, l));
|
1909
|
+
}
|
1910
|
+
} else {
|
1911
|
+
memset(Ls, 0, QK_K/16);
|
1912
|
+
}
|
1913
|
+
if (max_min) {
|
1914
|
+
float id = 1/mm;
|
1915
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
1916
|
+
int l = nearest_int(id*mins[j]);
|
1917
|
+
Lm[j] = MAX(0, MIN(15, l));
|
1918
|
+
}
|
1919
|
+
} else {
|
1920
|
+
memset(Lm, 0, QK_K/16);
|
1921
|
+
}
|
1922
|
+
#else
|
1923
|
+
dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
|
1924
|
+
mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
|
1925
|
+
#endif
|
1872
1926
|
y[i].d = GGML_FP32_TO_FP16(dm);
|
1873
1927
|
y[i].dmin = GGML_FP32_TO_FP16(mm);
|
1874
1928
|
dm = GGML_FP16_TO_FP32(y[i].d);
|
@@ -3470,6 +3524,265 @@ static const uint64_t iq2xs_grid[512] = {
|
|
3470
3524
|
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
3471
3525
|
};
|
3472
3526
|
|
3527
|
+
static const uint64_t iq2s_grid[1024] = {
|
3528
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
3529
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
3530
|
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
3531
|
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
3532
|
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
3533
|
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
3534
|
+
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
3535
|
+
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
3536
|
+
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
3537
|
+
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
3538
|
+
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
3539
|
+
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
3540
|
+
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
3541
|
+
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
3542
|
+
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
3543
|
+
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
3544
|
+
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
3545
|
+
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
3546
|
+
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
3547
|
+
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
3548
|
+
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
3549
|
+
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
3550
|
+
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
3551
|
+
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
3552
|
+
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
3553
|
+
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
3554
|
+
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
3555
|
+
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
3556
|
+
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
3557
|
+
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
3558
|
+
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
3559
|
+
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
3560
|
+
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
3561
|
+
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
3562
|
+
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
3563
|
+
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
3564
|
+
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
3565
|
+
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
3566
|
+
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
3567
|
+
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
3568
|
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
3569
|
+
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
3570
|
+
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
3571
|
+
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
3572
|
+
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
3573
|
+
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
3574
|
+
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
3575
|
+
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
3576
|
+
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
3577
|
+
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
3578
|
+
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
3579
|
+
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
3580
|
+
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
3581
|
+
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
3582
|
+
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
3583
|
+
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
3584
|
+
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
3585
|
+
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
3586
|
+
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
3587
|
+
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
3588
|
+
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
3589
|
+
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
3590
|
+
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
3591
|
+
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
3592
|
+
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
3593
|
+
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
3594
|
+
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
3595
|
+
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
3596
|
+
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
3597
|
+
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
3598
|
+
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
3599
|
+
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
3600
|
+
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
3601
|
+
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
3602
|
+
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
3603
|
+
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
3604
|
+
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
3605
|
+
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
3606
|
+
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
3607
|
+
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
3608
|
+
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
3609
|
+
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
3610
|
+
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
3611
|
+
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
3612
|
+
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
3613
|
+
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
3614
|
+
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
3615
|
+
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
3616
|
+
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
3617
|
+
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
3618
|
+
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
3619
|
+
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
3620
|
+
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
3621
|
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
3622
|
+
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
3623
|
+
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
3624
|
+
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
3625
|
+
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
3626
|
+
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
3627
|
+
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
3628
|
+
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
3629
|
+
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
3630
|
+
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
3631
|
+
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
3632
|
+
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
3633
|
+
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
3634
|
+
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
3635
|
+
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
3636
|
+
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
3637
|
+
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
3638
|
+
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
3639
|
+
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
3640
|
+
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
3641
|
+
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
3642
|
+
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
3643
|
+
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
3644
|
+
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
3645
|
+
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
3646
|
+
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
3647
|
+
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
3648
|
+
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
3649
|
+
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
3650
|
+
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
3651
|
+
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
3652
|
+
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
3653
|
+
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
3654
|
+
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
3655
|
+
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
3656
|
+
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
3657
|
+
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
3658
|
+
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
3659
|
+
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
3660
|
+
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
3661
|
+
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
3662
|
+
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
3663
|
+
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
3664
|
+
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
3665
|
+
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
3666
|
+
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
3667
|
+
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
3668
|
+
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
3669
|
+
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
3670
|
+
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
3671
|
+
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
3672
|
+
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
3673
|
+
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
3674
|
+
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
3675
|
+
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
3676
|
+
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
3677
|
+
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
3678
|
+
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
3679
|
+
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
3680
|
+
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
3681
|
+
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
3682
|
+
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
3683
|
+
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
3684
|
+
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
3685
|
+
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
3686
|
+
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
3687
|
+
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
3688
|
+
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
3689
|
+
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
3690
|
+
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
3691
|
+
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
3692
|
+
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
3693
|
+
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
3694
|
+
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
3695
|
+
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
3696
|
+
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
3697
|
+
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
3698
|
+
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
3699
|
+
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
3700
|
+
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
3701
|
+
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
3702
|
+
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
3703
|
+
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
3704
|
+
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
3705
|
+
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
3706
|
+
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
3707
|
+
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
3708
|
+
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
3709
|
+
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
3710
|
+
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
3711
|
+
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
3712
|
+
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
3713
|
+
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
3714
|
+
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
3715
|
+
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
3716
|
+
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
3717
|
+
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
3718
|
+
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
3719
|
+
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
3720
|
+
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
3721
|
+
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
3722
|
+
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
3723
|
+
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
3724
|
+
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
3725
|
+
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
3726
|
+
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
3727
|
+
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
3728
|
+
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
3729
|
+
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
3730
|
+
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
3731
|
+
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
3732
|
+
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
3733
|
+
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
3734
|
+
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
3735
|
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
3736
|
+
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
3737
|
+
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
3738
|
+
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
3739
|
+
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
3740
|
+
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
3741
|
+
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
3742
|
+
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
3743
|
+
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
3744
|
+
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
3745
|
+
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
3746
|
+
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
3747
|
+
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
3748
|
+
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
3749
|
+
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
3750
|
+
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
3751
|
+
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
3752
|
+
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
3753
|
+
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
3754
|
+
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
3755
|
+
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
3756
|
+
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
3757
|
+
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
3758
|
+
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
3759
|
+
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
3760
|
+
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
3761
|
+
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
3762
|
+
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
3763
|
+
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
3764
|
+
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
3765
|
+
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
3766
|
+
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
3767
|
+
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
3768
|
+
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
3769
|
+
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
3770
|
+
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
3771
|
+
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
3772
|
+
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
3773
|
+
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
3774
|
+
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
3775
|
+
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
3776
|
+
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
3777
|
+
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
3778
|
+
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
3779
|
+
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
3780
|
+
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
3781
|
+
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
3782
|
+
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
3783
|
+
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
3784
|
+
};
|
3785
|
+
|
3473
3786
|
static const uint32_t iq3xxs_grid[256] = {
|
3474
3787
|
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
3475
3788
|
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
@@ -3505,6 +3818,73 @@ static const uint32_t iq3xxs_grid[256] = {
|
|
3505
3818
|
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
3506
3819
|
};
|
3507
3820
|
|
3821
|
+
static const uint32_t iq3xs_grid[512] = {
|
3822
|
+
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
3823
|
+
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
3824
|
+
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
3825
|
+
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
3826
|
+
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
3827
|
+
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
3828
|
+
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
3829
|
+
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
3830
|
+
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
3831
|
+
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
3832
|
+
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
3833
|
+
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
3834
|
+
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
3835
|
+
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
3836
|
+
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
3837
|
+
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
3838
|
+
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
3839
|
+
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
3840
|
+
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
3841
|
+
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
3842
|
+
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
3843
|
+
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
3844
|
+
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
3845
|
+
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
3846
|
+
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
3847
|
+
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
3848
|
+
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
3849
|
+
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
3850
|
+
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
3851
|
+
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
3852
|
+
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
3853
|
+
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
3854
|
+
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
3855
|
+
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
3856
|
+
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
3857
|
+
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
3858
|
+
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
3859
|
+
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
3860
|
+
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
3861
|
+
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
3862
|
+
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
3863
|
+
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
3864
|
+
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
3865
|
+
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
3866
|
+
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
3867
|
+
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
3868
|
+
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
3869
|
+
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
3870
|
+
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
3871
|
+
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
3872
|
+
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
3873
|
+
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
3874
|
+
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
3875
|
+
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
3876
|
+
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
3877
|
+
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
3878
|
+
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
3879
|
+
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
3880
|
+
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
3881
|
+
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
3882
|
+
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
3883
|
+
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
3884
|
+
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
3885
|
+
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
3886
|
+
};
|
3887
|
+
|
3508
3888
|
#define NGRID_IQ2XXS 512
|
3509
3889
|
static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
|
3510
3890
|
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
@@ -3704,6 +4084,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
3704
4084
|
}
|
3705
4085
|
}
|
3706
4086
|
|
4087
|
+
// ====================== 2.5625 bpw (de)-quantization
|
4088
|
+
|
4089
|
+
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
|
4090
|
+
assert(k % QK_K == 0);
|
4091
|
+
const int nb = k / QK_K;
|
4092
|
+
|
4093
|
+
float db[2];
|
4094
|
+
|
4095
|
+
for (int i = 0; i < nb; i++) {
|
4096
|
+
|
4097
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4098
|
+
const uint8_t * qs = x[i].qs;
|
4099
|
+
const uint8_t * qh = x[i].qh;
|
4100
|
+
const uint8_t * signs = qs + QK_K/8;
|
4101
|
+
|
4102
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
4103
|
+
db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
|
4104
|
+
db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
|
4105
|
+
for (int l = 0; l < 4; ++l) {
|
4106
|
+
const float dl = db[l/2];
|
4107
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
4108
|
+
for (int j = 0; j < 8; ++j) {
|
4109
|
+
y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
4110
|
+
}
|
4111
|
+
y += 8;
|
4112
|
+
}
|
4113
|
+
qs += 4;
|
4114
|
+
signs += 4;
|
4115
|
+
}
|
4116
|
+
}
|
4117
|
+
}
|
4118
|
+
|
3707
4119
|
// ====================== 3.0625 bpw (de)-quantization
|
3708
4120
|
|
3709
4121
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
|
@@ -3736,6 +4148,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
|
3736
4148
|
}
|
3737
4149
|
}
|
3738
4150
|
|
4151
|
+
// ====================== 3.3125 bpw (de)-quantization
|
4152
|
+
|
4153
|
+
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
|
4154
|
+
assert(k % QK_K == 0);
|
4155
|
+
const int nb = k / QK_K;
|
4156
|
+
|
4157
|
+
for (int i = 0; i < nb; i++) {
|
4158
|
+
|
4159
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4160
|
+
const uint8_t * qs = x[i].qs;
|
4161
|
+
const uint8_t * qh = x[i].qh;
|
4162
|
+
const uint8_t * signs = x[i].signs;
|
4163
|
+
|
4164
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
4165
|
+
const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f;
|
4166
|
+
const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f;
|
4167
|
+
for (int l = 0; l < 4; ++l) {
|
4168
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
4169
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
4170
|
+
for (int j = 0; j < 4; ++j) {
|
4171
|
+
y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4172
|
+
y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4173
|
+
}
|
4174
|
+
y += 8;
|
4175
|
+
}
|
4176
|
+
qs += 8;
|
4177
|
+
signs += 4;
|
4178
|
+
for (int l = 0; l < 4; ++l) {
|
4179
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
|
4180
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
|
4181
|
+
for (int j = 0; j < 4; ++j) {
|
4182
|
+
y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4183
|
+
y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
4184
|
+
}
|
4185
|
+
y += 8;
|
4186
|
+
}
|
4187
|
+
qh += 2;
|
4188
|
+
qs += 8;
|
4189
|
+
signs += 4;
|
4190
|
+
}
|
4191
|
+
}
|
4192
|
+
}
|
4193
|
+
|
3739
4194
|
// ====================== 1.5625 bpw (de)-quantization
|
3740
4195
|
|
3741
4196
|
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
|
@@ -3799,6 +4254,33 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
|
3799
4254
|
}
|
3800
4255
|
}
|
3801
4256
|
|
4257
|
+
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
4258
|
+
assert(k % QK_K == 0);
|
4259
|
+
#if QK_K == 64
|
4260
|
+
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
4261
|
+
#else
|
4262
|
+
const int nb = k / QK_K;
|
4263
|
+
|
4264
|
+
for (int i = 0; i < nb; i++) {
|
4265
|
+
|
4266
|
+
const uint8_t * qs = x[i].qs;
|
4267
|
+
|
4268
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4269
|
+
|
4270
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
4271
|
+
const int ls = ((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4);
|
4272
|
+
const float dl = d * (ls - 32);
|
4273
|
+
for (int j = 0; j < 16; ++j) {
|
4274
|
+
y[j+ 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
|
4275
|
+
y[j+16] = dl * kvalues_iq4nl[qs[j] >> 4];
|
4276
|
+
}
|
4277
|
+
y += 32;
|
4278
|
+
qs += 16;
|
4279
|
+
}
|
4280
|
+
}
|
4281
|
+
#endif
|
4282
|
+
}
|
4283
|
+
|
3802
4284
|
//===================================== Q8_K ==============================================
|
3803
4285
|
|
3804
4286
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
@@ -5857,7 +6339,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
5857
6339
|
|
5858
6340
|
float sumf = 0;
|
5859
6341
|
|
5860
|
-
int isum[
|
6342
|
+
int isum[QK_K/16];
|
5861
6343
|
|
5862
6344
|
for (int i = 0; i < nb; ++i) {
|
5863
6345
|
|
@@ -5873,14 +6355,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
5873
6355
|
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
5874
6356
|
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
5875
6357
|
|
5876
|
-
isum
|
6358
|
+
memset(isum, 0, (QK_K/16)*sizeof(int));
|
5877
6359
|
for (int l = 0; l < 16; ++l) {
|
5878
6360
|
isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
|
5879
6361
|
isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
|
5880
6362
|
isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
|
5881
6363
|
isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
|
5882
6364
|
}
|
5883
|
-
for (int l = 0; l <
|
6365
|
+
for (int l = 0; l < QK_K/16; ++l) {
|
5884
6366
|
isum[l] *= (sc[l] & 0xF);
|
5885
6367
|
}
|
5886
6368
|
sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
|
@@ -8806,6 +9288,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
8806
9288
|
|
8807
9289
|
#endif
|
8808
9290
|
|
9291
|
+
#if defined (__AVX2__) || defined (__ARM_NEON)
|
8809
9292
|
static const int8_t keven_signs_q2xs[1024] = {
|
8810
9293
|
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
8811
9294
|
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
@@ -8840,6 +9323,7 @@ static const int8_t keven_signs_q2xs[1024] = {
|
|
8840
9323
|
1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
|
8841
9324
|
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
8842
9325
|
};
|
9326
|
+
#endif
|
8843
9327
|
|
8844
9328
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
8845
9329
|
assert(n % QK_K == 0);
|
@@ -9037,15 +9521,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9037
9521
|
|
9038
9522
|
#elif defined(__AVX2__)
|
9039
9523
|
|
9040
|
-
const __m128i m4 = _mm_set1_epi8(0xf);
|
9041
|
-
const __m128i m1 = _mm_set1_epi8(1);
|
9042
|
-
const __m256i m511 = _mm256_set1_epi16(511);
|
9043
9524
|
const __m256i mone = _mm256_set1_epi8(1);
|
9044
|
-
|
9045
|
-
static const uint8_t k_bit_helper[32] = {
|
9046
|
-
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9047
|
-
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9048
|
-
};
|
9049
9525
|
static const char block_sign_shuffle_mask_1[32] = {
|
9050
9526
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
9051
9527
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
@@ -9059,11 +9535,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9059
9535
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9060
9536
|
};
|
9061
9537
|
|
9062
|
-
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
9063
9538
|
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
|
9064
9539
|
const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
|
9065
9540
|
const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
|
9066
9541
|
|
9542
|
+
#if QK_K == 64
|
9543
|
+
static const uint8_t k_bit_helper[16] = {
|
9544
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9545
|
+
};
|
9546
|
+
const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
|
9547
|
+
const __m128i m511 = _mm_set1_epi16(511);
|
9548
|
+
typedef union {
|
9549
|
+
__m128i vec_index;
|
9550
|
+
uint16_t index[8];
|
9551
|
+
} index_t;
|
9552
|
+
|
9553
|
+
index_t idx;
|
9554
|
+
__m256 accumf = _mm256_setzero_ps();
|
9555
|
+
for (int i = 0; i < nb; ++i) {
|
9556
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9557
|
+
const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
|
9558
|
+
idx.vec_index = _mm_and_si128(q2_data, m511);
|
9559
|
+
|
9560
|
+
const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
|
9561
|
+
const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
|
9562
|
+
const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
|
9563
|
+
|
9564
|
+
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
9565
|
+
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
|
9566
|
+
const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
|
9567
|
+
|
9568
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
9569
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
|
9570
|
+
|
9571
|
+
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
|
9572
|
+
iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
|
9573
|
+
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
|
9574
|
+
iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
|
9575
|
+
|
9576
|
+
__m256i signs;
|
9577
|
+
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
|
9578
|
+
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
9579
|
+
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
|
9580
|
+
|
9581
|
+
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
|
9582
|
+
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
9583
|
+
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
|
9584
|
+
|
9585
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
9586
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
9587
|
+
|
9588
|
+
const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
|
9589
|
+
const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
|
9590
|
+
|
9591
|
+
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
|
9592
|
+
|
9593
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
|
9594
|
+
|
9595
|
+
}
|
9596
|
+
|
9597
|
+
*s = 0.125f * hsum_float_8(accumf);
|
9598
|
+
#else
|
9599
|
+
|
9600
|
+
static const uint8_t k_bit_helper[32] = {
|
9601
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9602
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
9603
|
+
};
|
9604
|
+
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
9605
|
+
const __m256i m511 = _mm256_set1_epi16(511);
|
9606
|
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
9607
|
+
const __m128i m1 = _mm_set1_epi8(1);
|
9608
|
+
|
9067
9609
|
uint64_t aux64;
|
9068
9610
|
|
9069
9611
|
// somewhat hacky, but gives a significant boost in performance
|
@@ -9152,6 +9694,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9152
9694
|
}
|
9153
9695
|
|
9154
9696
|
*s = 0.125f * hsum_float_8(accumf);
|
9697
|
+
#endif
|
9155
9698
|
|
9156
9699
|
#else
|
9157
9700
|
|
@@ -9193,7 +9736,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9193
9736
|
#endif
|
9194
9737
|
}
|
9195
9738
|
|
9196
|
-
void
|
9739
|
+
void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
9197
9740
|
assert(n % QK_K == 0);
|
9198
9741
|
assert(nrc == 1);
|
9199
9742
|
UNUSED(nrc);
|
@@ -9201,88 +9744,148 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9201
9744
|
UNUSED(by);
|
9202
9745
|
UNUSED(bs);
|
9203
9746
|
|
9204
|
-
const
|
9205
|
-
const block_q8_K
|
9747
|
+
const block_iq2_s * restrict x = vx;
|
9748
|
+
const block_q8_K * restrict y = vy;
|
9206
9749
|
|
9207
9750
|
const int nb = n / QK_K;
|
9208
9751
|
|
9209
9752
|
#if defined(__ARM_NEON)
|
9210
9753
|
|
9211
|
-
|
9754
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
9755
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
9756
|
+
};
|
9212
9757
|
|
9213
|
-
|
9758
|
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
9214
9759
|
|
9215
|
-
|
9760
|
+
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
|
9761
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
9762
|
+
const uint8x16_t m1 = vdupq_n_u8(1);
|
9763
|
+
const int32x4_t vzero = vdupq_n_s32(0);
|
9764
|
+
|
9765
|
+
uint8x16x2_t vs;
|
9766
|
+
ggml_int8x16x4_t q2s;
|
9216
9767
|
ggml_int8x16x4_t q8b;
|
9217
9768
|
|
9218
9769
|
float sumf = 0;
|
9219
9770
|
for (int i = 0; i < nb; ++i) {
|
9771
|
+
|
9220
9772
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9221
|
-
|
9222
|
-
const uint8_t * restrict
|
9223
|
-
const
|
9224
|
-
|
9773
|
+
|
9774
|
+
const uint8_t * restrict qs = x[i].qs;
|
9775
|
+
const uint8_t * restrict qh = x[i].qh;
|
9776
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
9777
|
+
const int8_t * restrict q8 = y[i].qs;
|
9778
|
+
|
9779
|
+
int sumi1 = 0, sumi2 = 0;
|
9225
9780
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9226
9781
|
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9227
|
-
|
9228
|
-
|
9229
|
-
const
|
9230
|
-
|
9231
|
-
const
|
9232
|
-
|
9233
|
-
|
9234
|
-
|
9235
|
-
|
9236
|
-
|
9237
|
-
|
9238
|
-
|
9239
|
-
|
9240
|
-
|
9241
|
-
|
9242
|
-
|
9243
|
-
|
9244
|
-
|
9782
|
+
q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
|
9783
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
|
9784
|
+
q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
|
9785
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
|
9786
|
+
q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
|
9787
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
|
9788
|
+
q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
|
9789
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
|
9790
|
+
qs += 8;
|
9791
|
+
|
9792
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
9793
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9794
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9795
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
9796
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9797
|
+
|
9798
|
+
q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
|
9799
|
+
q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
|
9800
|
+
|
9801
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
9802
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9803
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9804
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
9805
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9806
|
+
|
9807
|
+
signs += 4;
|
9808
|
+
|
9809
|
+
q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
|
9810
|
+
q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
|
9811
|
+
|
9812
|
+
const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
|
9813
|
+
const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
|
9814
|
+
const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
|
9815
|
+
const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
|
9816
|
+
|
9817
|
+
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
|
9818
|
+
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
|
9819
|
+
sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
|
9820
|
+
sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
|
9245
9821
|
}
|
9246
|
-
sumf += d*(
|
9822
|
+
sumf += d*(sumi1 + sumi2);
|
9247
9823
|
}
|
9248
|
-
|
9824
|
+
|
9825
|
+
*s = 0.125f * sumf;
|
9249
9826
|
|
9250
9827
|
#elif defined(__AVX2__)
|
9251
9828
|
|
9252
|
-
|
9829
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
9830
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
9831
|
+
};
|
9253
9832
|
|
9254
|
-
|
9833
|
+
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9834
|
+
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9835
|
+
};
|
9836
|
+
|
9837
|
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
9838
|
+
const __m128i m1 = _mm_set1_epi8(1);
|
9839
|
+
|
9840
|
+
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
9841
|
+
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
9842
|
+
|
9843
|
+
uint64_t aux64;
|
9255
9844
|
|
9256
9845
|
__m256 accumf = _mm256_setzero_ps();
|
9257
9846
|
for (int i = 0; i < nb; ++i) {
|
9258
9847
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9259
|
-
const uint8_t * restrict
|
9260
|
-
const uint8_t * restrict
|
9848
|
+
const uint8_t * restrict qs = x[i].qs;
|
9849
|
+
const uint8_t * restrict qh = x[i].qh;
|
9850
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
9261
9851
|
const int8_t * restrict q8 = y[i].qs;
|
9852
|
+
|
9853
|
+
memcpy(&aux64, x[i].scales, 8);
|
9854
|
+
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
|
9855
|
+
const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
|
9856
|
+
|
9262
9857
|
__m256i sumi1 = _mm256_setzero_si256();
|
9263
9858
|
__m256i sumi2 = _mm256_setzero_si256();
|
9264
9859
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9265
9860
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
9266
9861
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
9267
|
-
const __m256i q2_1 =
|
9268
|
-
|
9269
|
-
|
9270
|
-
|
9271
|
-
|
9272
|
-
|
9273
|
-
|
9274
|
-
|
9275
|
-
|
9276
|
-
|
9277
|
-
|
9278
|
-
|
9279
|
-
const __m256i
|
9280
|
-
const __m256i
|
9281
|
-
|
9282
|
-
|
9283
|
-
|
9284
|
-
const __m256i
|
9285
|
-
const __m256i
|
9862
|
+
const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
|
9863
|
+
iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
|
9864
|
+
iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
|
9865
|
+
iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
|
9866
|
+
const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
|
9867
|
+
iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
|
9868
|
+
iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
|
9869
|
+
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
9870
|
+
qs += 8;
|
9871
|
+
|
9872
|
+
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
9873
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9874
|
+
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
9875
|
+
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
9876
|
+
|
9877
|
+
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
9878
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9879
|
+
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
9880
|
+
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
9881
|
+
|
9882
|
+
signs += 4;
|
9883
|
+
|
9884
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
|
9885
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
|
9886
|
+
|
9887
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
|
9888
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
|
9286
9889
|
sumi1 = _mm256_add_epi32(sumi1, p1);
|
9287
9890
|
sumi2 = _mm256_add_epi32(sumi2, p2);
|
9288
9891
|
}
|
@@ -9291,18 +9894,162 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9291
9894
|
|
9292
9895
|
}
|
9293
9896
|
|
9294
|
-
*s = 0.
|
9897
|
+
*s = 0.125f * hsum_float_8(accumf);
|
9295
9898
|
|
9296
9899
|
#else
|
9297
9900
|
|
9298
|
-
|
9901
|
+
float sumf = 0;
|
9902
|
+
for (int i = 0; i < nb; i++) {
|
9299
9903
|
|
9300
|
-
float sumf = 0.f;
|
9301
|
-
for (int i = 0; i < nb; ++i) {
|
9302
9904
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9303
|
-
const
|
9304
|
-
const uint8_t *
|
9305
|
-
const
|
9905
|
+
const int8_t * q8 = y[i].qs;
|
9906
|
+
const uint8_t * qs = x[i].qs;
|
9907
|
+
const uint8_t * qh = x[i].qh;
|
9908
|
+
const uint8_t * signs = qs + QK_K/8;
|
9909
|
+
|
9910
|
+
int bsum = 0;
|
9911
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
9912
|
+
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
9913
|
+
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
9914
|
+
int sumi1 = 0, sumi2 = 0;
|
9915
|
+
for (int l = 0; l < 2; ++l) {
|
9916
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
9917
|
+
for (int j = 0; j < 8; ++j) {
|
9918
|
+
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
9919
|
+
}
|
9920
|
+
q8 += 8;
|
9921
|
+
}
|
9922
|
+
for (int l = 2; l < 4; ++l) {
|
9923
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
9924
|
+
for (int j = 0; j < 8; ++j) {
|
9925
|
+
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
9926
|
+
}
|
9927
|
+
q8 += 8;
|
9928
|
+
}
|
9929
|
+
bsum += ls1 * sumi1 + ls2 * sumi2;
|
9930
|
+
qs += 4;
|
9931
|
+
signs += 4;
|
9932
|
+
}
|
9933
|
+
|
9934
|
+
sumf += d * bsum;
|
9935
|
+
}
|
9936
|
+
|
9937
|
+
*s = 0.125f * sumf;
|
9938
|
+
|
9939
|
+
#endif
|
9940
|
+
|
9941
|
+
}
|
9942
|
+
|
9943
|
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
9944
|
+
assert(n % QK_K == 0);
|
9945
|
+
assert(nrc == 1);
|
9946
|
+
UNUSED(nrc);
|
9947
|
+
UNUSED(bx);
|
9948
|
+
UNUSED(by);
|
9949
|
+
UNUSED(bs);
|
9950
|
+
|
9951
|
+
const block_iq3_xxs * restrict x = vx;
|
9952
|
+
const block_q8_K * restrict y = vy;
|
9953
|
+
|
9954
|
+
const int nb = n / QK_K;
|
9955
|
+
|
9956
|
+
#if defined(__ARM_NEON)
|
9957
|
+
|
9958
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
9959
|
+
|
9960
|
+
uint32_t aux32[2];
|
9961
|
+
|
9962
|
+
ggml_int8x16x4_t q3s;
|
9963
|
+
ggml_int8x16x4_t q8b;
|
9964
|
+
|
9965
|
+
float sumf = 0;
|
9966
|
+
for (int i = 0; i < nb; ++i) {
|
9967
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9968
|
+
const uint8_t * restrict q3 = x[i].qs;
|
9969
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
9970
|
+
const int8_t * restrict q8 = y[i].qs;
|
9971
|
+
float sumf1 = 0, sumf2 = 0;
|
9972
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9973
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9974
|
+
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
|
9975
|
+
const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
|
9976
|
+
const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
|
9977
|
+
const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
|
9978
|
+
const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
|
9979
|
+
q3 += 16;
|
9980
|
+
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
|
9981
|
+
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
|
9982
|
+
q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
|
9983
|
+
q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
|
9984
|
+
q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
|
9985
|
+
q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
|
9986
|
+
q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
|
9987
|
+
q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
|
9988
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
9989
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
9990
|
+
sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
|
9991
|
+
sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
|
9992
|
+
}
|
9993
|
+
sumf += d*(sumf1 + sumf2);
|
9994
|
+
}
|
9995
|
+
*s = 0.5f * sumf;
|
9996
|
+
|
9997
|
+
#elif defined(__AVX2__)
|
9998
|
+
|
9999
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
10000
|
+
|
10001
|
+
uint32_t aux32[2];
|
10002
|
+
|
10003
|
+
__m256 accumf = _mm256_setzero_ps();
|
10004
|
+
for (int i = 0; i < nb; ++i) {
|
10005
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10006
|
+
const uint8_t * restrict q3 = x[i].qs;
|
10007
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
10008
|
+
const int8_t * restrict q8 = y[i].qs;
|
10009
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
10010
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
10011
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10012
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10013
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10014
|
+
const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
10015
|
+
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
10016
|
+
q3 += 8;
|
10017
|
+
const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
10018
|
+
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
10019
|
+
q3 += 8;
|
10020
|
+
memcpy(aux32, gas, 8); gas += 8;
|
10021
|
+
const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
|
10022
|
+
signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
|
10023
|
+
const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
|
10024
|
+
signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
10025
|
+
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
|
10026
|
+
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
|
10027
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
10028
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
10029
|
+
const uint16_t ls1 = aux32[0] >> 28;
|
10030
|
+
const uint16_t ls2 = aux32[1] >> 28;
|
10031
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
|
10032
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
|
10033
|
+
sumi1 = _mm256_add_epi32(sumi1, p1);
|
10034
|
+
sumi2 = _mm256_add_epi32(sumi2, p2);
|
10035
|
+
}
|
10036
|
+
|
10037
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
10038
|
+
|
10039
|
+
}
|
10040
|
+
|
10041
|
+
*s = 0.25f * hsum_float_8(accumf);
|
10042
|
+
|
10043
|
+
#else
|
10044
|
+
|
10045
|
+
uint32_t aux32;
|
10046
|
+
|
10047
|
+
float sumf = 0.f;
|
10048
|
+
for (int i = 0; i < nb; ++i) {
|
10049
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10050
|
+
const uint8_t * restrict q3 = x[i].qs;
|
10051
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
10052
|
+
const int8_t * restrict q8 = y[i].qs;
|
9306
10053
|
int32_t bsum = 0;
|
9307
10054
|
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
9308
10055
|
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
@@ -9327,6 +10074,202 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9327
10074
|
#endif
|
9328
10075
|
}
|
9329
10076
|
|
10077
|
+
// Dot product of a row of IQ3_S blocks (vx) with a row of Q8_K blocks (vy).
//
//   n   - number of values in the row; must be a multiple of QK_K
//   s   - receives the resulting dot product
//   vx  - array of n/QK_K block_iq3_s
//   vy  - array of n/QK_K block_q8_K
//   bs, bx, by - unused
//   nrc - must be 1
//
// As read by this function, every group of 4 values is a 9-bit index into
// iq3xs_grid (8 bits from qs, 1 bit from qh) whose 32-bit entry holds 4 byte
// magnitudes; `signs` carries one bit per value (set = negate) and `scales`
// carries one 4-bit scale per 32 values, used as 2*scale + 1. All three code
// paths finish by multiplying the accumulated sum by 0.25.
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_s * restrict x = vx;
    const block_q8_K  * restrict y = vy;

    const int nb = n / QK_K;

#if defined(__ARM_NEON)

    // mask1: table-lookup indices that replicate byte k of the 32-bit sign word 8 times.
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    // mask2: selects a different bit of the replicated byte in each lane.
    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

    const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
    const uint8x16_t   mask2 = vld1q_u8(k_mask2);

    uint8x16x2_t vs;
    ggml_int8x16x4_t q3s;
    ggml_int8x16x4_t q8b;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t  * restrict qs = x[i].qs;
        const uint8_t  * restrict qh = x[i].qh;
        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
        const int8_t   * restrict q8 = y[i].qs;
        int sumi1 = 0, sumi2 = 0;
        // Two groups of 32 values per iteration.
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
            const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
                                          iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
            const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
                                          iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
            const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
                                          iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
            const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
                                          iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
            qs += 16;

            // Widen to uint32_t before shifting: a uint16_t promotes to signed int,
            // and shifting a value with bit 15 set left by 16 would overflow it.
            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32((uint32_t)signs[0] | ((uint32_t)signs[1] << 16)));
            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
            // Lanes become 0xFF where the sign bit is set, 0x00 otherwise.
            vs.val[0] = vceqq_u8(vs.val[0], mask2);
            vs.val[1] = vceqq_u8(vs.val[1], mask2);

            // (v ^ m) - m negates v where m == 0xFF and leaves it unchanged where m == 0.
            q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0]));
            q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));

            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32((uint32_t)signs[2] | ((uint32_t)signs[3] << 16)));
            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
            vs.val[0] = vceqq_u8(vs.val[0], mask2);
            vs.val[1] = vceqq_u8(vs.val[1], mask2);

            signs += 4;

            q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0]));
            q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1]));

            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
            // Low nibble scales the first 32 values, high nibble the second 32.
            sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
            sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >>  4));
        }
        sumf += d*(sumi1 + sumi2);
    }
    *s = 0.25f * sumf;

#elif defined(__AVX2__)

    // mask1: shuffle indices that replicate byte k of the 32-bit sign word 8 times.
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    // mask2: selects a different bit of the replicated byte in each lane.
    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    };

    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);

    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t  * restrict qs = x[i].qs;
        const uint8_t  * restrict qh = x[i].qh;
        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
        const int8_t   * restrict q8 = y[i].qs;
        __m256i sumi1 = _mm256_setzero_si256();
        __m256i sumi2 = _mm256_setzero_si256();
        // Two groups of 32 values per iteration.
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)],
                                                  iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)],
                                                  iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)],
                                                  iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)],
                                                  iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)],
                                                  iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)],
                                                  iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)],
                                                  iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
            qs += 8;
            const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)],
                                                  iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)],
                                                  iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)],
                                                  iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)],
                                                  iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)],
                                                  iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)],
                                                  iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)],
                                                  iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]);
            qs += 8;

            // Widen to uint32_t before shifting: a uint16_t promotes to signed int,
            // and shifting a value with bit 15 set left by 16 would overflow it.
            __m256i aux256 = _mm256_set1_epi32((int)((uint32_t)signs[0] | ((uint32_t)signs[1] << 16)));
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
            // Lanes become 0xFF where the sign bit is set, 0x00 otherwise.
            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
            // The sign is applied to the q8 operand here: (v ^ m) - m negates v where m == 0xFF.
            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);

            aux256 = _mm256_set1_epi32((int)((uint32_t)signs[2] | ((uint32_t)signs[3] << 16)));
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);

            signs += 4;

            const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
            const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
            // Low nibble scales the first 32 values, high nibble the second 32.
            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
            sumi1 = _mm256_add_epi32(sumi1, p1);
            sumi2 = _mm256_add_epi32(sumi2, p2);
        }

        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);

    }

    *s = 0.25f * hsum_float_8(accumf);

#else

    // Portable reference path.
    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * restrict qs = x[i].qs;
        const uint8_t * restrict qh = x[i].qh;
        const uint8_t * restrict signs = x[i].signs;
        const int8_t  * restrict q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
            int32_t sumi = 0;
            // First 32 values: 4 sign bytes, each covering two grid entries (8 values).
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            // Second 32 values, using the next qh byte and the high-nibble scale.
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = 0.25f * sumf;
#endif
}
|
10271
|
+
|
10272
|
+
|
9330
10273
|
#ifdef __AVX2__
|
9331
10274
|
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
9332
10275
|
const __m256i ax = _mm256_sign_epi8(x, x);
|
@@ -9348,7 +10291,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9348
10291
|
|
9349
10292
|
const int nb = n / QK_K;
|
9350
10293
|
|
9351
|
-
|
10294
|
+
// TODO: implement for QK_K = 64
|
10295
|
+
#if defined __ARM_NEON && QK_K == 256
|
9352
10296
|
|
9353
10297
|
const uint8x16_t m8 = vdupq_n_u8(0x08);
|
9354
10298
|
const uint8x16_t m7 = vdupq_n_u8(0x07);
|
@@ -9405,7 +10349,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9405
10349
|
|
9406
10350
|
*s = sumf;
|
9407
10351
|
|
9408
|
-
|
10352
|
+
// TODO: implement for QK_K = 64
|
10353
|
+
#elif defined __AVX2__ && QK_K == 256
|
9409
10354
|
|
9410
10355
|
const __m128i m8 = _mm_set1_epi8(0x08);
|
9411
10356
|
const __m128i m7 = _mm_set1_epi8(0x07);
|
@@ -9420,8 +10365,12 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9420
10365
|
|
9421
10366
|
uint64_t aux64;
|
9422
10367
|
|
9423
|
-
|
9424
|
-
|
10368
|
+
typedef union m256i_uint16 {
|
10369
|
+
__m256i reg;
|
10370
|
+
uint16_t s[16];
|
10371
|
+
} m256i_uint16_t;
|
10372
|
+
|
10373
|
+
m256i_uint16_t v_gindex;
|
9425
10374
|
|
9426
10375
|
__m256 accum = _mm256_setzero_ps();
|
9427
10376
|
for (int i = 0; i < nb; ++i) {
|
@@ -9436,13 +10385,13 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
9436
10385
|
memcpy(&aux64, sc, 8); sc += 8;
|
9437
10386
|
const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
|
9438
10387
|
const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
|
9439
|
-
v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
10388
|
+
v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
|
9440
10389
|
const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
|
9441
10390
|
|
9442
10391
|
for (int i32 = 0; i32 < 4; ++i32) {
|
9443
10392
|
const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
9444
|
-
const __m256i q1b = _mm256_set_epi64x(iq1s_grid[
|
9445
|
-
iq1s_grid[
|
10393
|
+
const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
|
10394
|
+
iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
|
9446
10395
|
const __m256i dot = mul_add_epi8(q1b, q8b);
|
9447
10396
|
const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
|
9448
10397
|
const __m256i p = _mm256_madd_epi16(s16, dot);
|
@@ -9523,6 +10472,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
9523
10472
|
float sumf = 0;
|
9524
10473
|
|
9525
10474
|
for (int ib = 0; ib < nb; ib += 2) {
|
10475
|
+
|
9526
10476
|
q4bits.val[0] = vld1q_u8(x[ib+0].qs);
|
9527
10477
|
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
|
9528
10478
|
q8b.val[0] = vld1q_s8(y[ib+0].qs);
|
@@ -9592,6 +10542,138 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
9592
10542
|
#endif
|
9593
10543
|
}
|
9594
10544
|
|
10545
|
+
void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
10546
|
+
assert(nrc == 1);
|
10547
|
+
UNUSED(nrc);
|
10548
|
+
UNUSED(bx);
|
10549
|
+
UNUSED(by);
|
10550
|
+
UNUSED(bs);
|
10551
|
+
assert(n % QK_K == 0);
|
10552
|
+
#if QK_K == 64
|
10553
|
+
ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
|
10554
|
+
#else
|
10555
|
+
|
10556
|
+
const block_iq4_xs * restrict x = vx;
|
10557
|
+
const block_q8_K * restrict y = vy;
|
10558
|
+
|
10559
|
+
const int nb = n / QK_K;
|
10560
|
+
|
10561
|
+
#if defined __ARM_NEON
|
10562
|
+
const int8x16_t values = vld1q_s8(kvalues_iq4nl);
|
10563
|
+
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
10564
|
+
ggml_uint8x16x2_t q4bits;
|
10565
|
+
ggml_int8x16x4_t q4b;
|
10566
|
+
ggml_int8x16x4_t q8b;
|
10567
|
+
int32x4_t prod_1, prod_2;
|
10568
|
+
|
10569
|
+
float sumf = 0;
|
10570
|
+
|
10571
|
+
for (int ibl = 0; ibl < nb; ++ibl) {
|
10572
|
+
|
10573
|
+
const int8_t * q8 = y[ibl].qs;
|
10574
|
+
const uint8_t * q4 = x[ibl].qs;
|
10575
|
+
uint16_t h = x[ibl].scales_h;
|
10576
|
+
|
10577
|
+
int sumi1 = 0, sumi2 = 0;
|
10578
|
+
for (int ib = 0; ib < QK_K/64; ++ib) {
|
10579
|
+
|
10580
|
+
q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
|
10581
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
10582
|
+
|
10583
|
+
q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
|
10584
|
+
q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
|
10585
|
+
q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
|
10586
|
+
q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
|
10587
|
+
|
10588
|
+
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
10589
|
+
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
10590
|
+
|
10591
|
+
int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
|
10592
|
+
int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
|
10593
|
+
h >>= 4;
|
10594
|
+
sumi1 += vaddvq_s32(prod_1) * ls1;
|
10595
|
+
sumi2 += vaddvq_s32(prod_2) * ls2;
|
10596
|
+
|
10597
|
+
}
|
10598
|
+
|
10599
|
+
sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
|
10600
|
+
}
|
10601
|
+
|
10602
|
+
*s = sumf;
|
10603
|
+
|
10604
|
+
#elif defined __AVX2__
|
10605
|
+
|
10606
|
+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
10607
|
+
const __m128i m4b = _mm_set1_epi8(0x0f);
|
10608
|
+
|
10609
|
+
__m256 accum = _mm256_setzero_ps();
|
10610
|
+
for (int ibl = 0; ibl < nb; ++ibl) {
|
10611
|
+
const uint8_t * qs = x[ibl].qs;
|
10612
|
+
const int8_t * q8 = y[ibl].qs;
|
10613
|
+
uint16_t sh = x[ibl].scales_h;
|
10614
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
10615
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
10616
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
10617
|
+
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
|
10618
|
+
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
|
10619
|
+
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10620
|
+
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10621
|
+
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
10622
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
10623
|
+
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
10624
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
10625
|
+
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
10626
|
+
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
10627
|
+
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
10628
|
+
const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
|
10629
|
+
sh >>= 4;
|
10630
|
+
const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
|
10631
|
+
const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
|
10632
|
+
sumi1 = _mm256_add_epi32(p_1, sumi1);
|
10633
|
+
sumi2 = _mm256_add_epi32(p_2, sumi2);
|
10634
|
+
}
|
10635
|
+
accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
|
10636
|
+
_mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
|
10637
|
+
}
|
10638
|
+
|
10639
|
+
*s = hsum_float_8(accum);
|
10640
|
+
|
10641
|
+
#else
|
10642
|
+
float sumf = 0;
|
10643
|
+
for (int ibl = 0; ibl < nb; ++ibl) {
|
10644
|
+
const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
10645
|
+
uint16_t h = x[ibl].scales_h;
|
10646
|
+
const uint8_t * qs = x[ibl].qs;
|
10647
|
+
const int8_t * q8 = y[ibl].qs;
|
10648
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
10649
|
+
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
10650
|
+
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
10651
|
+
h >>= 4;
|
10652
|
+
const float d1 = d4d8*(ls1 - 32);
|
10653
|
+
const float d2 = d4d8*(ls2 - 32);
|
10654
|
+
int sumi1 = 0, sumi2 = 0;
|
10655
|
+
for (int j = 0; j < 16; ++j) {
|
10656
|
+
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
10657
|
+
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
10658
|
+
}
|
10659
|
+
sumf += d1 * (sumi1 + sumi2);
|
10660
|
+
qs += 16;
|
10661
|
+
q8 += 32;
|
10662
|
+
sumi1 = sumi2 = 0;
|
10663
|
+
for (int j = 0; j < 16; ++j) {
|
10664
|
+
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
10665
|
+
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
10666
|
+
}
|
10667
|
+
sumf += d2 * (sumi1 + sumi2);
|
10668
|
+
qs += 16;
|
10669
|
+
q8 += 32;
|
10670
|
+
}
|
10671
|
+
}
|
10672
|
+
*s = sumf;
|
10673
|
+
#endif
|
10674
|
+
#endif
|
10675
|
+
}
|
10676
|
+
|
9595
10677
|
// ================================ IQ2 quantization =============================================
|
9596
10678
|
|
9597
10679
|
typedef struct {
|
@@ -9600,22 +10682,25 @@ typedef struct {
|
|
9600
10682
|
uint16_t * neighbours;
|
9601
10683
|
} iq2_entry_t;
|
9602
10684
|
|
9603
|
-
// One lazily-filled entry per grid-based quantization type; the slot for a
// given type is chosen by iq2_data_index(). Every entry starts out empty.
static iq2_entry_t iq2_data[4] = {
    /* 0: GGML_TYPE_IQ2_XXS */ {NULL, NULL, NULL},
    /* 1: GGML_TYPE_IQ2_XS  */ {NULL, NULL, NULL},
    /* 2: GGML_TYPE_IQ1_S   */ {NULL, NULL, NULL},
    /* 3: GGML_TYPE_IQ2_S   */ {NULL, NULL, NULL},
};
|
9608
10691
|
|
9609
10692
|
static inline int iq2_data_index(enum ggml_type type) {
|
9610
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
10693
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
9611
10694
|
return type == GGML_TYPE_IQ2_XXS ? 0 :
|
9612
|
-
type == GGML_TYPE_IQ2_XS ? 1 :
|
10695
|
+
type == GGML_TYPE_IQ2_XS ? 1 :
|
10696
|
+
type == GGML_TYPE_IQ1_S ? 2 : 3;
|
9613
10697
|
}
|
9614
10698
|
|
9615
10699
|
static inline int iq2_grid_size(enum ggml_type type) {
|
9616
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
10700
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
9617
10701
|
return type == GGML_TYPE_IQ2_XXS ? 256 :
|
9618
|
-
type == GGML_TYPE_IQ2_XS ? 512 :
|
10702
|
+
type == GGML_TYPE_IQ2_XS ? 512 :
|
10703
|
+
type == GGML_TYPE_IQ1_S ? 512 : 1024;
|
9619
10704
|
}
|
9620
10705
|
|
9621
10706
|
static int iq2_compare_func(const void * left, const void * right) {
|
@@ -9716,11 +10801,79 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
9716
10801
|
41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
|
9717
10802
|
42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
|
9718
10803
|
};
|
10804
|
+
static const uint16_t kgrid_2bit_1024[1024] = {
|
10805
|
+
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
10806
|
+
73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
|
10807
|
+
165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
|
10808
|
+
337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
|
10809
|
+
517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
|
10810
|
+
674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
|
10811
|
+
1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
|
10812
|
+
1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
|
10813
|
+
1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
|
10814
|
+
1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
|
10815
|
+
2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
|
10816
|
+
2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
|
10817
|
+
2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
|
10818
|
+
4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
|
10819
|
+
4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
|
10820
|
+
4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
|
10821
|
+
4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
|
10822
|
+
4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
|
10823
|
+
5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
|
10824
|
+
5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
|
10825
|
+
5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
|
10826
|
+
5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
|
10827
|
+
6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
|
10828
|
+
6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
|
10829
|
+
8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
|
10830
|
+
8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
|
10831
|
+
8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
|
10832
|
+
9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
|
10833
|
+
9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
|
10834
|
+
10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
|
10835
|
+
16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
|
10836
|
+
16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
|
10837
|
+
16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
|
10838
|
+
16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
|
10839
|
+
17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
|
10840
|
+
17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
|
10841
|
+
17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
|
10842
|
+
17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
|
10843
|
+
18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
|
10844
|
+
18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
|
10845
|
+
18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
|
10846
|
+
20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
|
10847
|
+
20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
|
10848
|
+
20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
|
10849
|
+
21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
|
10850
|
+
21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
|
10851
|
+
22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
|
10852
|
+
22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
|
10853
|
+
24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
|
10854
|
+
24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
|
10855
|
+
25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
|
10856
|
+
26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
|
10857
|
+
32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
|
10858
|
+
33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
|
10859
|
+
33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
|
10860
|
+
33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
|
10861
|
+
34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
|
10862
|
+
35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
|
10863
|
+
36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
|
10864
|
+
37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
|
10865
|
+
38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
|
10866
|
+
39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
|
10867
|
+
41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
|
10868
|
+
42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
|
10869
|
+
};
|
9719
10870
|
|
9720
10871
|
const int kmap_size = 43692;
|
9721
|
-
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
10872
|
+
//const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
10873
|
+
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
|
9722
10874
|
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
|
9723
|
-
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
10875
|
+
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
10876
|
+
type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
|
9724
10877
|
uint64_t * kgrid_q2xs;
|
9725
10878
|
int * kmap_q2xs;
|
9726
10879
|
uint16_t * kneighbors_q2xs;
|
@@ -9817,7 +10970,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
9817
10970
|
}
|
9818
10971
|
|
9819
10972
|
void iq2xs_free_impl(enum ggml_type type) {
|
9820
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
10973
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
9821
10974
|
const int gindex = iq2_data_index(type);
|
9822
10975
|
if (iq2_data[gindex].grid) {
|
9823
10976
|
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
@@ -9866,7 +11019,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
9866
11019
|
|
9867
11020
|
const int kMaxQ = 3;
|
9868
11021
|
|
9869
|
-
const int nbl = n/
|
11022
|
+
const int nbl = n/QK_K;
|
9870
11023
|
|
9871
11024
|
block_iq2_xxs * y = vy;
|
9872
11025
|
|
@@ -10039,7 +11192,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
10039
11192
|
|
10040
11193
|
const int kMaxQ = 3;
|
10041
11194
|
|
10042
|
-
const int nbl = n/
|
11195
|
+
const int nbl = n/QK_K;
|
10043
11196
|
|
10044
11197
|
block_iq2_xs * y = vy;
|
10045
11198
|
|
@@ -10239,14 +11392,15 @@ typedef struct {
|
|
10239
11392
|
uint16_t * neighbours;
|
10240
11393
|
} iq3_entry_t;
|
10241
11394
|
|
10242
|
-
static iq3_entry_t iq3_data[
|
11395
|
+
static iq3_entry_t iq3_data[2] = {
|
11396
|
+
{NULL, NULL, NULL},
|
10243
11397
|
{NULL, NULL, NULL},
|
10244
11398
|
};
|
10245
11399
|
|
10246
11400
|
static inline int iq3_data_index(int grid_size) {
|
10247
11401
|
(void)grid_size;
|
10248
|
-
GGML_ASSERT(grid_size == 256);
|
10249
|
-
return 0;
|
11402
|
+
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
11403
|
+
return grid_size == 256 ? 0 : 1;
|
10250
11404
|
}
|
10251
11405
|
|
10252
11406
|
static int iq3_compare_func(const void * left, const void * right) {
|
@@ -10278,9 +11432,44 @@ void iq3xs_init_impl(int grid_size) {
|
|
10278
11432
|
3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
|
10279
11433
|
3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
|
10280
11434
|
};
|
11435
|
+
static const uint16_t kgrid_512[512] = {
|
11436
|
+
0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34,
|
11437
|
+
37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77,
|
11438
|
+
80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142,
|
11439
|
+
145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210,
|
11440
|
+
217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288,
|
11441
|
+
291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393,
|
11442
|
+
395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514,
|
11443
|
+
516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576,
|
11444
|
+
577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653,
|
11445
|
+
655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727,
|
11446
|
+
728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833,
|
11447
|
+
840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977,
|
11448
|
+
989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047,
|
11449
|
+
1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103,
|
11450
|
+
1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199,
|
11451
|
+
1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296,
|
11452
|
+
1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415,
|
11453
|
+
1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561,
|
11454
|
+
1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648,
|
11455
|
+
1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761,
|
11456
|
+
1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877,
|
11457
|
+
1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068,
|
11458
|
+
2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177,
|
11459
|
+
2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269,
|
11460
|
+
2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520,
|
11461
|
+
2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634,
|
11462
|
+
2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805,
|
11463
|
+
2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083,
|
11464
|
+
3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276,
|
11465
|
+
3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591,
|
11466
|
+
3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729,
|
11467
|
+
3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032,
|
11468
|
+
};
|
11469
|
+
|
10281
11470
|
const int kmap_size = 4096;
|
10282
|
-
const int nwant = 2;
|
10283
|
-
const uint16_t * kgrid = kgrid_256;
|
11471
|
+
const int nwant = grid_size == 256 ? 2 : 3;
|
11472
|
+
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
|
10284
11473
|
uint32_t * kgrid_q3xs;
|
10285
11474
|
int * kmap_q3xs;
|
10286
11475
|
uint16_t * kneighbors_q3xs;
|
@@ -10377,7 +11566,7 @@ void iq3xs_init_impl(int grid_size) {
|
|
10377
11566
|
}
|
10378
11567
|
|
10379
11568
|
void iq3xs_free_impl(int grid_size) {
|
10380
|
-
GGML_ASSERT(grid_size == 256);
|
11569
|
+
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
10381
11570
|
const int gindex = iq3_data_index(grid_size);
|
10382
11571
|
if (iq3_data[gindex].grid) {
|
10383
11572
|
free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
|
@@ -10410,9 +11599,10 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
10410
11599
|
return grid_index;
|
10411
11600
|
}
|
10412
11601
|
|
10413
|
-
static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n,
|
11602
|
+
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
|
11603
|
+
const float * restrict quant_weights) {
|
10414
11604
|
|
10415
|
-
const int gindex = iq3_data_index(
|
11605
|
+
const int gindex = iq3_data_index(grid_size);
|
10416
11606
|
|
10417
11607
|
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
10418
11608
|
const int * kmap_q3xs = iq3_data[gindex].map;
|
@@ -10426,9 +11616,23 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10426
11616
|
|
10427
11617
|
const int kMaxQ = 8;
|
10428
11618
|
|
10429
|
-
const int nbl = n/
|
11619
|
+
const int nbl = n/QK_K;
|
10430
11620
|
|
10431
|
-
|
11621
|
+
ggml_fp16_t * dh;
|
11622
|
+
uint8_t * qs;
|
11623
|
+
int block_size;
|
11624
|
+
if (grid_size == 256) {
|
11625
|
+
block_iq3_xxs * y = vy;
|
11626
|
+
dh = &y->d;
|
11627
|
+
qs = y->qs;
|
11628
|
+
block_size = sizeof(block_iq3_xxs);
|
11629
|
+
} else {
|
11630
|
+
block_iq3_s * y = vy;
|
11631
|
+
dh = &y->d;
|
11632
|
+
qs = y->qs;
|
11633
|
+
block_size = sizeof(block_iq3_s);
|
11634
|
+
}
|
11635
|
+
int quant_size = block_size - sizeof(ggml_fp16_t);
|
10432
11636
|
|
10433
11637
|
float scales[QK_K/32];
|
10434
11638
|
float weight[32];
|
@@ -10439,57 +11643,271 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10439
11643
|
bool is_on_grid[8];
|
10440
11644
|
bool is_on_grid_aux[8];
|
10441
11645
|
uint8_t block_signs[8];
|
10442
|
-
uint8_t q3[3*(QK_K/8)];
|
11646
|
+
uint8_t q3[3*(QK_K/8)+QK_K/32];
|
10443
11647
|
uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
|
11648
|
+
uint8_t * qh = q3 + 3*(QK_K/8);
|
10444
11649
|
|
10445
11650
|
for (int ibl = 0; ibl < nbl; ++ibl) {
|
10446
11651
|
|
11652
|
+
dh[0] = GGML_FP32_TO_FP16(0.f);
|
11653
|
+
memset(q3, 0, 3*QK_K/8+QK_K/32);
|
11654
|
+
|
11655
|
+
float max_scale = 0;
|
11656
|
+
|
11657
|
+
const float * xbl = x + QK_K*ibl;
|
11658
|
+
float sumx2 = 0;
|
11659
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
11660
|
+
float sigma2 = 2*sumx2/QK_K;
|
11661
|
+
|
11662
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
11663
|
+
const float * xb = xbl + 32*ib;
|
11664
|
+
if (quant_weights) {
|
11665
|
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
11666
|
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
11667
|
+
} else {
|
11668
|
+
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
11669
|
+
}
|
11670
|
+
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
11671
|
+
for (int k = 0; k < 4; ++k) {
|
11672
|
+
int nflip = 0;
|
11673
|
+
uint8_t s = 0;
|
11674
|
+
for (int i = 0; i < 8; ++i) {
|
11675
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
11676
|
+
else {
|
11677
|
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
11678
|
+
}
|
11679
|
+
}
|
11680
|
+
if (nflip%2) {
|
11681
|
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
11682
|
+
for (int i = 1; i < 8; ++i) {
|
11683
|
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
11684
|
+
if (ax < min) {
|
11685
|
+
min = ax; imin = i;
|
11686
|
+
}
|
11687
|
+
}
|
11688
|
+
xval[8*k+imin] = -xval[8*k+imin];
|
11689
|
+
s ^= (1 << imin);
|
11690
|
+
}
|
11691
|
+
block_signs[k] = s & 127;
|
11692
|
+
}
|
11693
|
+
float max = xval[0];
|
11694
|
+
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
11695
|
+
if (!max) {
|
11696
|
+
scales[ib] = 0;
|
11697
|
+
memset(L, 0, 32);
|
11698
|
+
continue;
|
11699
|
+
}
|
11700
|
+
float best = 0;
|
11701
|
+
float scale = max/(2*kMaxQ-1);
|
11702
|
+
for (int is = -15; is <= 15; ++is) {
|
11703
|
+
float id = (2*kMaxQ-1+is*0.2f)/max;
|
11704
|
+
float this_scale = 1/id;
|
11705
|
+
for (int k = 0; k < 8; ++k) {
|
11706
|
+
for (int i = 0; i < 4; ++i) {
|
11707
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
11708
|
+
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
11709
|
+
}
|
11710
|
+
uint16_t u = 0;
|
11711
|
+
for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
|
11712
|
+
int grid_index = kmap_q3xs[u];
|
11713
|
+
is_on_grid_aux[k] = true;
|
11714
|
+
if (grid_index < 0) {
|
11715
|
+
is_on_grid_aux[k] = false;
|
11716
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
11717
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
|
11718
|
+
}
|
11719
|
+
}
|
11720
|
+
float sumqx = 0, sumq2 = 0;
|
11721
|
+
for (int i = 0; i < 32; ++i) {
|
11722
|
+
float w = weight[i];
|
11723
|
+
float q = 2*Laux[i] + 1;
|
11724
|
+
sumqx += w*xval[i]*q;
|
11725
|
+
sumq2 += w*q*q;
|
11726
|
+
}
|
11727
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
11728
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
11729
|
+
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
|
11730
|
+
for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
11731
|
+
}
|
11732
|
+
}
|
11733
|
+
int n_not_ongrid = 0;
|
11734
|
+
for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
11735
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
11736
|
+
float id = 1/scale;
|
11737
|
+
for (int k = 0; k < 8; ++k) {
|
11738
|
+
if (is_on_grid[k]) continue;
|
11739
|
+
uint16_t u = 0;
|
11740
|
+
for (int i = 0; i < 4; ++i) {
|
11741
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
11742
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
11743
|
+
u |= (l << 3*i);
|
11744
|
+
}
|
11745
|
+
int grid_index = kmap_q3xs[u];
|
11746
|
+
if (grid_index < 0) {
|
11747
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
11748
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
|
11749
|
+
}
|
11750
|
+
const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
|
11751
|
+
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
11752
|
+
}
|
11753
|
+
float sumqx = 0, sumq2 = 0;
|
11754
|
+
for (int i = 0; i < 32; ++i) {
|
11755
|
+
float w = weight[i];
|
11756
|
+
float q = 2*L[i] + 1;
|
11757
|
+
sumqx += w*xval[i]*q;
|
11758
|
+
sumq2 += w*q*q;
|
11759
|
+
}
|
11760
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
11761
|
+
}
|
11762
|
+
if (scale < 0) {
|
11763
|
+
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
11764
|
+
// and correspondingly flip quant signs.
|
11765
|
+
scale = -scale;
|
11766
|
+
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
11767
|
+
}
|
11768
|
+
for (int k = 0; k < 8; ++k) {
|
11769
|
+
uint16_t u = 0;
|
11770
|
+
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
11771
|
+
int grid_index = kmap_q3xs[u];
|
11772
|
+
if (grid_index < 0) {
|
11773
|
+
printf("Oops: found point %u not on grid:", u);
|
11774
|
+
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
11775
|
+
printf("\n");
|
11776
|
+
GGML_ASSERT(false);
|
11777
|
+
}
|
11778
|
+
if (grid_size == 256) {
|
11779
|
+
q3[8*ib+k] = grid_index;
|
11780
|
+
} else {
|
11781
|
+
q3[8*ib+k] = grid_index & 255;
|
11782
|
+
qh[ib] |= ((grid_index >> 8) << k);
|
11783
|
+
}
|
11784
|
+
|
11785
|
+
}
|
11786
|
+
scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
|
11787
|
+
GGML_ASSERT(scale >= 0);
|
11788
|
+
scales[ib] = scale;
|
11789
|
+
max_scale = MAX(max_scale, scale);
|
11790
|
+
}
|
11791
|
+
|
11792
|
+
if (!max_scale) {
|
11793
|
+
memset(qs, 0, quant_size);
|
11794
|
+
dh += block_size/sizeof(ggml_fp16_t);
|
11795
|
+
qs += block_size;
|
11796
|
+
continue;
|
11797
|
+
}
|
11798
|
+
|
11799
|
+
float d = max_scale/31;
|
11800
|
+
dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor
|
11801
|
+
float id = 1/d;
|
11802
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
11803
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
11804
|
+
l = MAX(0, MIN(15, l));
|
11805
|
+
scales_and_signs[ib] |= ((uint32_t)l << 28);
|
11806
|
+
}
|
11807
|
+
memcpy(qs, q3, quant_size);
|
11808
|
+
|
11809
|
+
dh += block_size/sizeof(ggml_fp16_t);
|
11810
|
+
qs += block_size;
|
11811
|
+
|
11812
|
+
}
|
11813
|
+
}
|
11814
|
+
|
11815
|
+
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
11816
|
+
(void)hist;
|
11817
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
11818
|
+
int nblock = n_per_row/QK_K;
|
11819
|
+
char * qrow = (char *)dst;
|
11820
|
+
for (int row = 0; row < nrow; ++row) {
|
11821
|
+
quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
|
11822
|
+
src += n_per_row;
|
11823
|
+
qrow += nblock*sizeof(block_iq3_xxs);
|
11824
|
+
}
|
11825
|
+
return nrow * nblock * sizeof(block_iq3_xxs);
|
11826
|
+
}
|
11827
|
+
|
11828
|
+
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
|
11829
|
+
assert(k % QK_K == 0);
|
11830
|
+
block_iq3_xxs * restrict y = vy;
|
11831
|
+
quantize_row_iq3_xxs_reference(x, y, k);
|
11832
|
+
}
|
11833
|
+
|
11834
|
+
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
|
11835
|
+
assert(k % QK_K == 0);
|
11836
|
+
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
11837
|
+
}
|
11838
|
+
|
11839
|
+
static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
|
11840
|
+
const float * restrict quant_weights,
|
11841
|
+
float * scales,
|
11842
|
+
float * weight,
|
11843
|
+
float * xval,
|
11844
|
+
int8_t * L,
|
11845
|
+
int8_t * Laux,
|
11846
|
+
float * waux,
|
11847
|
+
bool * is_on_grid,
|
11848
|
+
bool * is_on_grid_aux,
|
11849
|
+
uint8_t * block_signs) {
|
11850
|
+
|
11851
|
+
const int gindex = iq3_data_index(512);
|
11852
|
+
|
11853
|
+
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
11854
|
+
const int * kmap_q3xs = iq3_data[gindex].map;
|
11855
|
+
const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
|
11856
|
+
|
11857
|
+
//GGML_ASSERT(quant_weights && "missing quantization weights");
|
11858
|
+
GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
|
11859
|
+
GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
|
11860
|
+
GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
|
11861
|
+
GGML_ASSERT(n%QK_K == 0);
|
11862
|
+
|
11863
|
+
const int kMaxQ = 8;
|
11864
|
+
|
11865
|
+
const int nbl = n/QK_K;
|
11866
|
+
|
11867
|
+
block_iq3_s * y = vy;
|
11868
|
+
|
11869
|
+
const int bs4 = block_size/4;
|
11870
|
+
const int bs8 = block_size/8;
|
11871
|
+
|
11872
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
11873
|
+
|
11874
|
+
memset(&y[ibl], 0, sizeof(block_iq3_s));
|
10447
11875
|
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
10448
|
-
|
11876
|
+
|
11877
|
+
uint8_t * qs = y[ibl].qs;
|
11878
|
+
uint8_t * qh = y[ibl].qh;
|
11879
|
+
uint8_t * signs = y[ibl].signs;
|
10449
11880
|
|
10450
11881
|
float max_scale = 0;
|
10451
11882
|
|
10452
11883
|
const float * xbl = x + QK_K*ibl;
|
10453
11884
|
float sumx2 = 0;
|
10454
11885
|
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
10455
|
-
float sigma2 = sumx2/QK_K;
|
11886
|
+
float sigma2 = 2*sumx2/QK_K;
|
10456
11887
|
|
10457
|
-
for (int ib = 0; ib < QK_K/
|
10458
|
-
const float * xb = xbl +
|
11888
|
+
for (int ib = 0; ib < QK_K/block_size; ++ib) {
|
11889
|
+
const float * xb = xbl + block_size*ib;
|
10459
11890
|
if (quant_weights) {
|
10460
|
-
const float * qw = quant_weights + QK_K*ibl +
|
10461
|
-
for (int i = 0; i <
|
11891
|
+
const float * qw = quant_weights + QK_K*ibl + block_size*ib;
|
11892
|
+
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
10462
11893
|
} else {
|
10463
|
-
for (int i = 0; i <
|
11894
|
+
for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
|
10464
11895
|
}
|
10465
|
-
for (int i = 0; i <
|
10466
|
-
for (int k = 0; k <
|
10467
|
-
int nflip = 0;
|
11896
|
+
for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
|
11897
|
+
for (int k = 0; k < bs8; ++k) {
|
10468
11898
|
uint8_t s = 0;
|
10469
11899
|
for (int i = 0; i < 8; ++i) {
|
10470
11900
|
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
10471
11901
|
else {
|
10472
|
-
xval[8*k + i] = -xb[8*k + i];
|
10473
|
-
}
|
10474
|
-
}
|
10475
|
-
if (nflip%2) {
|
10476
|
-
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
10477
|
-
for (int i = 1; i < 8; ++i) {
|
10478
|
-
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
10479
|
-
if (ax < min) {
|
10480
|
-
min = ax; imin = i;
|
10481
|
-
}
|
11902
|
+
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
10482
11903
|
}
|
10483
|
-
xval[8*k+imin] = -xval[8*k+imin];
|
10484
|
-
s ^= (1 << imin);
|
10485
11904
|
}
|
10486
|
-
block_signs[k] = s
|
11905
|
+
block_signs[k] = s;
|
10487
11906
|
}
|
10488
11907
|
float max = xval[0];
|
10489
|
-
for (int i = 1; i <
|
11908
|
+
for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
|
10490
11909
|
if (!max) {
|
10491
11910
|
scales[ib] = 0;
|
10492
|
-
memset(L, 0, 32);
|
10493
11911
|
continue;
|
10494
11912
|
}
|
10495
11913
|
float best = 0;
|
@@ -10497,7 +11915,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10497
11915
|
for (int is = -15; is <= 15; ++is) {
|
10498
11916
|
float id = (2*kMaxQ-1+is*0.2f)/max;
|
10499
11917
|
float this_scale = 1/id;
|
10500
|
-
for (int k = 0; k <
|
11918
|
+
for (int k = 0; k < bs4; ++k) {
|
10501
11919
|
for (int i = 0; i < 4; ++i) {
|
10502
11920
|
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
10503
11921
|
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
@@ -10513,7 +11931,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10513
11931
|
}
|
10514
11932
|
}
|
10515
11933
|
float sumqx = 0, sumq2 = 0;
|
10516
|
-
for (int i = 0; i <
|
11934
|
+
for (int i = 0; i < block_size; ++i) {
|
10517
11935
|
float w = weight[i];
|
10518
11936
|
float q = 2*Laux[i] + 1;
|
10519
11937
|
sumqx += w*xval[i]*q;
|
@@ -10521,15 +11939,15 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10521
11939
|
}
|
10522
11940
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
10523
11941
|
scale = sumqx/sumq2; best = scale*sumqx;
|
10524
|
-
for (int i = 0; i <
|
10525
|
-
for (int k = 0; k <
|
11942
|
+
for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
|
11943
|
+
for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
10526
11944
|
}
|
10527
11945
|
}
|
10528
11946
|
int n_not_ongrid = 0;
|
10529
|
-
for (int k = 0; k <
|
11947
|
+
for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
10530
11948
|
if (n_not_ongrid > 0 && scale > 0) {
|
10531
11949
|
float id = 1/scale;
|
10532
|
-
for (int k = 0; k <
|
11950
|
+
for (int k = 0; k < bs4; ++k) {
|
10533
11951
|
if (is_on_grid[k]) continue;
|
10534
11952
|
uint16_t u = 0;
|
10535
11953
|
for (int i = 0; i < 4; ++i) {
|
@@ -10546,7 +11964,7 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10546
11964
|
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
10547
11965
|
}
|
10548
11966
|
float sumqx = 0, sumq2 = 0;
|
10549
|
-
for (int i = 0; i <
|
11967
|
+
for (int i = 0; i < block_size; ++i) {
|
10550
11968
|
float w = weight[i];
|
10551
11969
|
float q = 2*L[i] + 1;
|
10552
11970
|
sumqx += w*xval[i]*q;
|
@@ -10558,9 +11976,9 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10558
11976
|
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
10559
11977
|
// and correspondingly flip quant signs.
|
10560
11978
|
scale = -scale;
|
10561
|
-
for (int k = 0; k <
|
11979
|
+
for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
|
10562
11980
|
}
|
10563
|
-
for (int k = 0; k <
|
11981
|
+
for (int k = 0; k < bs4; ++k) {
|
10564
11982
|
uint16_t u = 0;
|
10565
11983
|
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
10566
11984
|
int grid_index = kmap_q3xs[u];
|
@@ -10570,99 +11988,71 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
10570
11988
|
printf("\n");
|
10571
11989
|
GGML_ASSERT(false);
|
10572
11990
|
}
|
10573
|
-
|
11991
|
+
qs[k] = grid_index & 255;
|
11992
|
+
qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
|
10574
11993
|
}
|
10575
|
-
|
11994
|
+
qs += bs4;
|
11995
|
+
for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
|
11996
|
+
signs += bs8;
|
10576
11997
|
GGML_ASSERT(scale >= 0);
|
10577
11998
|
scales[ib] = scale;
|
10578
11999
|
max_scale = MAX(max_scale, scale);
|
10579
12000
|
}
|
10580
12001
|
|
10581
12002
|
if (!max_scale) {
|
10582
|
-
memset(y[ibl].qs, 0, 3*QK_K/8);
|
10583
12003
|
continue;
|
10584
12004
|
}
|
10585
12005
|
|
10586
12006
|
float d = max_scale/31;
|
10587
12007
|
y[ibl].d = GGML_FP32_TO_FP16(d);
|
10588
12008
|
float id = 1/d;
|
10589
|
-
|
10590
|
-
|
10591
|
-
|
10592
|
-
|
10593
|
-
|
10594
|
-
|
10595
|
-
const float * xb = xbl + 32*ib;
|
10596
|
-
if (quant_weights) {
|
10597
|
-
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
10598
|
-
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
10599
|
-
} else {
|
10600
|
-
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
10601
|
-
}
|
10602
|
-
const float db = 0.25f * d * (1 + 2*l);
|
10603
|
-
for (int k = 0; k < 8; ++k) {
|
10604
|
-
const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
|
10605
|
-
const float * xk = xb + 4*k;
|
10606
|
-
const float * wk = weight + 4*k;
|
10607
|
-
//const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
|
10608
|
-
const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
|
10609
|
-
float best_mse = 0; int best_index = q3[8*ib+k];
|
10610
|
-
for (int j = 0; j < 4; ++j) {
|
10611
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
10612
|
-
best_mse += wk[j] * diff * diff;
|
10613
|
-
}
|
10614
|
-
for (int idx = 0; idx < 256; ++idx) {
|
10615
|
-
//grid = (const uint8_t *)(kgrid_q3xs + idx);
|
10616
|
-
grid = (const uint8_t *)(iq3xxs_grid + idx);
|
10617
|
-
float mse = 0;
|
10618
|
-
for (int j = 0; j < 4; ++j) {
|
10619
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
10620
|
-
mse += wk[j] * diff * diff;
|
10621
|
-
}
|
10622
|
-
if (mse < best_mse) {
|
10623
|
-
best_mse = mse; best_index = idx;
|
10624
|
-
}
|
10625
|
-
}
|
10626
|
-
q3[8*ib+k] = best_index;
|
10627
|
-
//grid = (const uint8_t *)(kgrid_q3xs + best_index);
|
10628
|
-
grid = (const uint8_t *)(iq3xxs_grid + best_index);
|
10629
|
-
for (int j = 0; j < 4; ++j) {
|
10630
|
-
float q = db * grid[j] * signs[j];
|
10631
|
-
sumqx += wk[j] * q * xk[j];
|
10632
|
-
sumq2 += wk[j] * q * q;
|
10633
|
-
}
|
10634
|
-
}
|
10635
|
-
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
10636
|
-
}
|
12009
|
+
for (int ib = 0; ib < QK_K/block_size; ib += 2) {
|
12010
|
+
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
|
12011
|
+
l1 = MAX(0, MIN(15, l1));
|
12012
|
+
int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
|
12013
|
+
l2 = MAX(0, MIN(15, l2));
|
12014
|
+
y[ibl].scales[ib/2] = l1 | (l2 << 4);
|
10637
12015
|
}
|
10638
|
-
|
12016
|
+
|
10639
12017
|
}
|
10640
12018
|
}
|
10641
12019
|
|
10642
|
-
|
12020
|
+
#define IQ3S_BLOCK_SIZE 32
|
12021
|
+
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
10643
12022
|
(void)hist;
|
10644
12023
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
10645
12024
|
int nblock = n_per_row/QK_K;
|
12025
|
+
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
12026
|
+
float weight[IQ3S_BLOCK_SIZE];
|
12027
|
+
float xval[IQ3S_BLOCK_SIZE];
|
12028
|
+
int8_t L[IQ3S_BLOCK_SIZE];
|
12029
|
+
int8_t Laux[IQ3S_BLOCK_SIZE];
|
12030
|
+
float waux[IQ3S_BLOCK_SIZE];
|
12031
|
+
bool is_on_grid[IQ3S_BLOCK_SIZE/4];
|
12032
|
+
bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
|
12033
|
+
uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
|
10646
12034
|
char * qrow = (char *)dst;
|
10647
12035
|
for (int row = 0; row < nrow; ++row) {
|
10648
|
-
|
12036
|
+
quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
|
12037
|
+
scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
|
10649
12038
|
src += n_per_row;
|
10650
|
-
qrow += nblock*sizeof(
|
12039
|
+
qrow += nblock*sizeof(block_iq3_s);
|
10651
12040
|
}
|
10652
|
-
return nrow * nblock * sizeof(
|
12041
|
+
return nrow * nblock * sizeof(block_iq3_s);
|
10653
12042
|
}
|
10654
12043
|
|
10655
|
-
void
|
12044
|
+
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
10656
12045
|
assert(k % QK_K == 0);
|
10657
|
-
|
10658
|
-
|
12046
|
+
block_iq3_s * restrict y = vy;
|
12047
|
+
quantize_row_iq3_s_reference(x, y, k);
|
10659
12048
|
}
|
10660
12049
|
|
10661
|
-
void
|
12050
|
+
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
10662
12051
|
assert(k % QK_K == 0);
|
10663
|
-
|
12052
|
+
quantize_iq3_s(x, y, 1, k, NULL, NULL);
|
10664
12053
|
}
|
10665
12054
|
|
12055
|
+
|
10666
12056
|
// =================================== 1.5 bpw ===================================================
|
10667
12057
|
|
10668
12058
|
static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
@@ -10745,7 +12135,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
10745
12135
|
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
10746
12136
|
GGML_ASSERT(n%QK_K == 0);
|
10747
12137
|
|
10748
|
-
const int nbl = n/
|
12138
|
+
const int nbl = n/QK_K;
|
10749
12139
|
|
10750
12140
|
block_iq1_s * y = vy;
|
10751
12141
|
|
@@ -10880,23 +12270,23 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|
10880
12270
|
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
10881
12271
|
}
|
10882
12272
|
|
10883
|
-
static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x,
|
10884
|
-
ggml_fp16_t * dh, uint8_t * q4,
|
10885
|
-
float * weight, uint8_t * L,
|
12273
|
+
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
|
12274
|
+
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
12275
|
+
float * scales, float * weight, uint8_t * L,
|
10886
12276
|
const int8_t * values,
|
10887
12277
|
const float * quant_weights) {
|
10888
12278
|
|
10889
12279
|
const int ntry = 7;
|
10890
12280
|
|
10891
12281
|
float sigma2 = 0;
|
10892
|
-
for (int j = 0; j <
|
10893
|
-
sigma2 *= 2.f/
|
12282
|
+
for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
|
12283
|
+
sigma2 *= 2.f/super_block_size;
|
10894
12284
|
|
10895
|
-
|
12285
|
+
memset(q4, 0, super_block_size/2);
|
12286
|
+
dh[0] = GGML_FP32_TO_FP16(0.f);
|
10896
12287
|
|
10897
|
-
|
10898
|
-
for (int ib = 0; ib <
|
10899
|
-
dh[ib] = GGML_FP32_TO_FP16(0.f);
|
12288
|
+
float max_scale = 0, amax_scale = 0;
|
12289
|
+
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
10900
12290
|
const float * xb = x + ib*block_size;
|
10901
12291
|
if (quant_weights) {
|
10902
12292
|
const float * qw = quant_weights + ib*block_size;
|
@@ -10912,6 +12302,7 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
|
|
10912
12302
|
}
|
10913
12303
|
}
|
10914
12304
|
if (!amax) {
|
12305
|
+
scales[ib] = 0;
|
10915
12306
|
continue;
|
10916
12307
|
}
|
10917
12308
|
float d = -max/values[0];
|
@@ -10925,7 +12316,6 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
|
|
10925
12316
|
sumqx += w*q*xb[j];
|
10926
12317
|
sumq2 += w*q*q;
|
10927
12318
|
}
|
10928
|
-
float best_id = id;
|
10929
12319
|
d = sumqx/sumq2;
|
10930
12320
|
float best = d*sumqx;
|
10931
12321
|
for (int itry = -ntry; itry <= ntry; ++itry) {
|
@@ -10941,15 +12331,47 @@ static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RE
|
|
10941
12331
|
}
|
10942
12332
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
10943
12333
|
d = sumqx/sumq2; best = d * sumqx;
|
10944
|
-
best_id = id;
|
10945
12334
|
}
|
10946
12335
|
}
|
10947
|
-
|
10948
|
-
|
10949
|
-
|
12336
|
+
scales[ib] = d;
|
12337
|
+
float abs_d = fabsf(d);
|
12338
|
+
if (abs_d > amax_scale) {
|
12339
|
+
amax_scale = abs_d; max_scale = d;
|
12340
|
+
}
|
12341
|
+
}
|
12342
|
+
|
12343
|
+
if (super_block_size/block_size > 1) {
|
12344
|
+
int nb = super_block_size/block_size;
|
12345
|
+
memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
|
12346
|
+
float d = -max_scale/32;
|
12347
|
+
dh[0] = GGML_FP32_TO_FP16(d);
|
12348
|
+
float id = d ? 1/d : 0.f;
|
12349
|
+
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
12350
|
+
int l = nearest_int(id*scales[ib]);
|
12351
|
+
l = MAX(-32, MIN(31, l));
|
12352
|
+
float dl = d * l;
|
12353
|
+
float idl = dl ? 1/dl : 0.f;
|
12354
|
+
uint8_t * Lb = L + ib*block_size;
|
12355
|
+
const float * xb = x + ib*block_size;
|
12356
|
+
for (int j = 0; j < block_size; ++j) {
|
12357
|
+
Lb[j] = best_index_int8(16, values, idl*xb[j]);
|
12358
|
+
}
|
12359
|
+
l += 32;
|
12360
|
+
uint8_t l_l = l & 0xf;
|
12361
|
+
uint8_t l_h = l >> 4;
|
12362
|
+
if (ib%2 == 0) scales_l[ib/2] = l_l;
|
12363
|
+
else scales_l[ib/2] |= (l_l << 4);
|
12364
|
+
scales_h[ib/8] |= (l_h << 2*(ib%8));
|
12365
|
+
}
|
12366
|
+
} else {
|
12367
|
+
dh[0] = GGML_FP32_TO_FP16(scales[0]);
|
12368
|
+
float id = scales[0] ? 1/scales[0] : 0;
|
12369
|
+
for (int j = 0; j < super_block_size; ++j) {
|
12370
|
+
L[j] = best_index_int8(16, values, id*x[j]);
|
10950
12371
|
}
|
10951
12372
|
}
|
10952
|
-
|
12373
|
+
|
12374
|
+
for (int i = 0; i < super_block_size/32; ++i) {
|
10953
12375
|
for (int j = 0; j < 16; ++j) {
|
10954
12376
|
q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
|
10955
12377
|
}
|
@@ -10962,12 +12384,16 @@ size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, i
|
|
10962
12384
|
int nblock = n_per_row/QK4_NL;
|
10963
12385
|
char * qrow = (char *)dst;
|
10964
12386
|
uint8_t L[QK4_NL];
|
10965
|
-
float weight[
|
12387
|
+
float weight[QK4_NL];
|
12388
|
+
uint16_t unused_h;
|
12389
|
+
uint8_t * unused_l = NULL;
|
12390
|
+
float scale;
|
10966
12391
|
for (int row = 0; row < nrow; ++row) {
|
10967
12392
|
block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
|
10968
12393
|
for (int ibl = 0; ibl < nblock; ++ibl) {
|
10969
12394
|
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
|
10970
|
-
quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs,
|
12395
|
+
quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
|
12396
|
+
&scale, weight, L, kvalues_iq4nl, qw);
|
10971
12397
|
}
|
10972
12398
|
src += n_per_row;
|
10973
12399
|
qrow += nblock*sizeof(block_iq4_nl);
|
@@ -10986,3 +12412,232 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
|
|
10986
12412
|
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
10987
12413
|
}
|
10988
12414
|
|
12415
|
+
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
12416
|
+
#if QK_K == 64
|
12417
|
+
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
12418
|
+
#else
|
12419
|
+
(void)hist;
|
12420
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
12421
|
+
int nblock = n_per_row/QK_K;
|
12422
|
+
char * qrow = (char *)dst;
|
12423
|
+
uint8_t L[QK_K];
|
12424
|
+
float weight[32];
|
12425
|
+
float scales[QK_K/32];
|
12426
|
+
for (int row = 0; row < nrow; ++row) {
|
12427
|
+
block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
|
12428
|
+
for (int ibl = 0; ibl < nblock; ++ibl) {
|
12429
|
+
const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
|
12430
|
+
quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
|
12431
|
+
scales, weight, L, kvalues_iq4nl, qw);
|
12432
|
+
}
|
12433
|
+
src += n_per_row;
|
12434
|
+
qrow += nblock*sizeof(block_iq4_xs);
|
12435
|
+
}
|
12436
|
+
return nrow * nblock * sizeof(block_iq4_xs);
|
12437
|
+
#endif
|
12438
|
+
}
|
12439
|
+
|
12440
|
+
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
12441
|
+
assert(k % QK_K == 0);
|
12442
|
+
block_iq4_xs * restrict y = vy;
|
12443
|
+
quantize_row_iq4_xs_reference(x, y, k);
|
12444
|
+
}
|
12445
|
+
|
12446
|
+
// Quantize a single row of k floats from `x` into `y` by calling
// quantize_iq4_xs() with one row, no histogram and no quant_weights.
// k must be a multiple of QK_K.
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
    assert(k % QK_K == 0);

    const int n_rows = 1;
    quantize_iq4_xs(x, y, n_rows, k, NULL, NULL);
}
|
12450
|
+
|
12451
|
+
// =============================== 2.5625 bpw
|
12452
|
+
|
12453
|
+
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
12454
|
+
|
12455
|
+
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
12456
|
+
|
12457
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
12458
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
12459
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
12460
|
+
|
12461
|
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
12462
|
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
12463
|
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
12464
|
+
GGML_ASSERT(n%QK_K == 0);
|
12465
|
+
|
12466
|
+
const int kMaxQ = 3;
|
12467
|
+
|
12468
|
+
const int nbl = n/QK_K;
|
12469
|
+
|
12470
|
+
block_iq2_s * y = vy;
|
12471
|
+
|
12472
|
+
float scales[QK_K/16];
|
12473
|
+
float weight[16];
|
12474
|
+
float xval[16];
|
12475
|
+
int8_t L[16];
|
12476
|
+
int8_t Laux[16];
|
12477
|
+
float waux[16];
|
12478
|
+
bool is_on_grid[2];
|
12479
|
+
bool is_on_grid_aux[2];
|
12480
|
+
uint8_t block_signs[2];
|
12481
|
+
|
12482
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
12483
|
+
|
12484
|
+
memset(&y[ibl], 0, sizeof(block_iq2_s));
|
12485
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
12486
|
+
|
12487
|
+
float max_scale = 0;
|
12488
|
+
|
12489
|
+
const float * xbl = x + QK_K*ibl;
|
12490
|
+
float sumx2 = 0;
|
12491
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
12492
|
+
float sigma2 = 2*sumx2/QK_K;
|
12493
|
+
|
12494
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
12495
|
+
const float * xb = xbl + 16*ib;
|
12496
|
+
if (quant_weights) {
|
12497
|
+
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
12498
|
+
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
12499
|
+
} else {
|
12500
|
+
for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
|
12501
|
+
}
|
12502
|
+
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
12503
|
+
for (int k = 0; k < 2; ++k) {
|
12504
|
+
uint8_t s = 0;
|
12505
|
+
for (int i = 0; i < 8; ++i) {
|
12506
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
12507
|
+
else {
|
12508
|
+
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
12509
|
+
}
|
12510
|
+
}
|
12511
|
+
block_signs[k] = s;
|
12512
|
+
}
|
12513
|
+
float max = xval[0];
|
12514
|
+
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
12515
|
+
if (!max) {
|
12516
|
+
scales[ib] = 0;
|
12517
|
+
continue;
|
12518
|
+
}
|
12519
|
+
float best = 0;
|
12520
|
+
float scale = max/(2*kMaxQ-1);
|
12521
|
+
is_on_grid[0] = is_on_grid[1] = true;
|
12522
|
+
for (int is = -9; is <= 9; ++is) {
|
12523
|
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
12524
|
+
float this_scale = 1/id;
|
12525
|
+
for (int k = 0; k < 2; ++k) {
|
12526
|
+
for (int i = 0; i < 8; ++i) {
|
12527
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
12528
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
12529
|
+
}
|
12530
|
+
uint16_t u = 0;
|
12531
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
12532
|
+
int grid_index = kmap_q2xs[u];
|
12533
|
+
is_on_grid_aux[k] = true;
|
12534
|
+
if (grid_index < 0) {
|
12535
|
+
is_on_grid_aux[k] = false;
|
12536
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
12537
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
12538
|
+
}
|
12539
|
+
}
|
12540
|
+
float sumqx = 0, sumq2 = 0;
|
12541
|
+
for (int i = 0; i < 16; ++i) {
|
12542
|
+
float w = weight[i];
|
12543
|
+
float q = 2*Laux[i] + 1;
|
12544
|
+
sumqx += w*xval[i]*q;
|
12545
|
+
sumq2 += w*q*q;
|
12546
|
+
}
|
12547
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
12548
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
12549
|
+
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
12550
|
+
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
12551
|
+
}
|
12552
|
+
}
|
12553
|
+
int n_not_ongrid = 0;
|
12554
|
+
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
12555
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
12556
|
+
float id = 1/scale;
|
12557
|
+
for (int k = 0; k < 2; ++k) {
|
12558
|
+
if (is_on_grid[k]) continue;
|
12559
|
+
uint16_t u = 0;
|
12560
|
+
for (int i = 0; i < 8; ++i) {
|
12561
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
12562
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
12563
|
+
u |= (l << 2*i);
|
12564
|
+
L[8*k + i] = l;
|
12565
|
+
}
|
12566
|
+
int grid_index = kmap_q2xs[u];
|
12567
|
+
if (grid_index < 0) {
|
12568
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
12569
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
12570
|
+
}
|
12571
|
+
}
|
12572
|
+
float sumqx = 0, sumq2 = 0;
|
12573
|
+
for (int i = 0; i < 16; ++i) {
|
12574
|
+
float w = weight[i];
|
12575
|
+
float q = 2*L[i] + 1;
|
12576
|
+
sumqx += w*xval[i]*q;
|
12577
|
+
sumq2 += w*q*q;
|
12578
|
+
}
|
12579
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
12580
|
+
}
|
12581
|
+
if (scale < 0) {
|
12582
|
+
scale = -scale;
|
12583
|
+
for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
|
12584
|
+
}
|
12585
|
+
for (int k = 0; k < 2; ++k) {
|
12586
|
+
uint16_t u = 0;
|
12587
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
12588
|
+
int grid_index = kmap_q2xs[u];
|
12589
|
+
if (grid_index < 0) {
|
12590
|
+
printf("Oops: found point %u not on grid:", u);
|
12591
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
12592
|
+
printf("\n");
|
12593
|
+
GGML_ASSERT(false);
|
12594
|
+
}
|
12595
|
+
const int i8 = 2*ib + k;
|
12596
|
+
y[ibl].qs[i8] = grid_index & 255;
|
12597
|
+
y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
|
12598
|
+
y[ibl].qs[QK_K/8 + i8] = block_signs[k];
|
12599
|
+
}
|
12600
|
+
GGML_ASSERT(scale >= 0);
|
12601
|
+
scales[ib] = scale;
|
12602
|
+
max_scale = MAX(max_scale, scale);
|
12603
|
+
}
|
12604
|
+
|
12605
|
+
if (!max_scale) {
|
12606
|
+
continue;
|
12607
|
+
}
|
12608
|
+
|
12609
|
+
float d = max_scale/31;
|
12610
|
+
y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
|
12611
|
+
float id = 1/d;
|
12612
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
12613
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
12614
|
+
l = MAX(0, MIN(15, l));
|
12615
|
+
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
12616
|
+
else y[ibl].scales[ib/2] |= (l << 4);
|
12617
|
+
}
|
12618
|
+
}
|
12619
|
+
}
|
12620
|
+
|
12621
|
+
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
12622
|
+
(void)hist;
|
12623
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
12624
|
+
int nblock = n_per_row/QK_K;
|
12625
|
+
char * qrow = (char *)dst;
|
12626
|
+
for (int row = 0; row < nrow; ++row) {
|
12627
|
+
quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
|
12628
|
+
src += n_per_row;
|
12629
|
+
qrow += nblock*sizeof(block_iq2_s);
|
12630
|
+
}
|
12631
|
+
return nrow * nblock * sizeof(block_iq2_s);
|
12632
|
+
}
|
12633
|
+
|
12634
|
+
// Quantize a single row of k floats from `x` into `y` by calling
// quantize_iq2_s() with one row, no histogram and no quant_weights.
// k must be a multiple of QK_K.
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
    assert(k % QK_K == 0);

    const int n_rows = 1;
    quantize_iq2_s(x, y, n_rows, k, NULL, NULL);
}
|
12638
|
+
|
12639
|
+
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|
12640
|
+
assert(k % QK_K == 0);
|
12641
|
+
block_iq2_s * restrict y = vy;
|
12642
|
+
quantize_row_iq2_s_reference(x, y, k);
|
12643
|
+
}
|