llama_cpp 0.12.6 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/ext/llama_cpp/llama_cpp.cpp +90 -269
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +28 -23
- data/vendor/tmp/llama.cpp/Makefile +51 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -11
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +191 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +2472 -862
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +3176 -667
- data/vendor/tmp/llama.cpp/ggml-quants.h +77 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +373 -424
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +186 -102
- data/vendor/tmp/llama.cpp/ggml.c +1266 -699
- data/vendor/tmp/llama.cpp/ggml.h +59 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1517 -717
- data/vendor/tmp/llama.cpp/llama.h +87 -63
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
@@ -438,6 +438,54 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
     return res;
 }
 
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
 #else
 
 #define ggml_int16x8x2_t int16x8x2_t

@@ -451,6 +499,8 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
 #define ggml_vld1q_u8_x4 vld1q_u8_x4
 #define ggml_vld1q_s8_x2 vld1q_s8_x2
 #define ggml_vld1q_s8_x4 vld1q_s8_x4
+#define ggml_vqtbl1q_s8 vqtbl1q_s8
+#define ggml_vqtbl1q_u8 vqtbl1q_u8
 
 #endif
 
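The two hunks above add untested scalar fallbacks for the AArch64 table-lookup intrinsics `vqtbl1q_s8`/`vqtbl1q_u8` and, where the intrinsics exist, map the `ggml_`-prefixed names directly to them. As a hedged illustration only (not part of the diff), the sketch below shows the per-lane byte lookup these helpers perform; note that the real NEON instruction yields 0 for indices outside 0..15, which the fallback above does not emulate.

```c
/* Hypothetical sketch, not from the diff: the element-wise table lookup that
 * ggml_vqtbl1q_s8 emulates, written with plain arrays. */
#include <stdint.h>
#include <stdio.h>

static void tbl1q_scalar(const int8_t table[16], const uint8_t idx[16], int8_t out[16]) {
    for (int i = 0; i < 16; ++i) {
        out[i] = table[idx[i]];  // same per-lane lookup as the fallback in the hunk above
    }
}

int main(void) {
    int8_t  table[16], out[16];
    uint8_t idx[16];
    for (int i = 0; i < 16; ++i) { table[i] = (int8_t)(i * 3); idx[i] = (uint8_t)(15 - i); }
    tbl1q_scalar(table, idx, out);
    for (int i = 0; i < 16; ++i) printf("%d ", out[i]);  // prints the table reversed: 45 42 ... 0
    printf("\n");
    return 0;
}
```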
@@ -1827,7 +1877,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
     float mins[QK_K/16];
     float scales[QK_K/16];
     float sw[QK_K/16];
-    float weight[QK_K/16];
+    float weight[16];
     uint8_t Ls[QK_K/16], Lm[QK_K/16];
 
     for (int i = 0; i < nb; i++) {

@@ -1838,12 +1888,41 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
         for (int j = 0; j < QK_K/16; ++j) {
             const float * restrict qw = quant_weights + QK_K * i + 16*j;
             for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
-            for (int l = 0; l < 16; ++l) sw[j] += weight[l];
+            for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
             scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
         }
 
-        float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
-        float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
+        float dm, mm;
+#if QK_K == 64
+        float max_scale = 0, max_min = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            max_scale = MAX(max_scale, scales[j]);
+            max_min = MAX(max_min, mins[j]);
+        }
+        dm = max_scale/15;
+        mm = max_min/15;
+        if (max_scale) {
+            float id = 1/dm;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(id*scales[j]);
+                Ls[j] = MAX(0, MIN(15, l));
+            }
+        } else {
+            memset(Ls, 0, QK_K/16);
+        }
+        if (max_min) {
+            float id = 1/mm;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(id*mins[j]);
+                Lm[j] = MAX(0, MIN(15, l));
+            }
+        } else {
+            memset(Lm, 0, QK_K/16);
+        }
+#else
+        dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
+        mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
+#endif
         y[i].d = GGML_FP32_TO_FP16(dm);
         y[i].dmin = GGML_FP32_TO_FP16(mm);
         dm = GGML_FP16_TO_FP32(y[i].d);
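When QK_K == 64, the branch added above quantizes each sub-block scale and min to a 4-bit level by dividing by the block maximum, rounding, and clamping, instead of calling make_qp_quants. The sketch below (not from the diff; scale values invented, nearest_int replaced by a simplified stand-in) walks through that arithmetic.

```c
/* Hedged sketch of the 4-bit scale mapping used in the QK_K == 64 branch above.
 * nearest_int here is a simplified stand-in for the helper in ggml-quants.c. */
#include <math.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

static int nearest_int(float x) { return (int)roundf(x); }

int main(void) {
    const float scales[4] = {0.02f, 0.11f, 0.07f, 0.15f};  // invented sub-block scales
    float max_scale = 0;
    for (int j = 0; j < 4; ++j) max_scale = MAX(max_scale, scales[j]);

    const float dm = max_scale / 15;  // block-level scale (stored as fp16 in y[i].d)
    const float id = 1 / dm;          // inverse scale used for quantizing
    for (int j = 0; j < 4; ++j) {
        int l = nearest_int(id * scales[j]);
        l = MAX(0, MIN(15, l));       // clamp to the 4-bit range, as Ls[j] above
        printf("scale %.2f -> level %2d -> reconstructed %.3f\n", scales[j], l, dm * l);
    }
    return 0;
}
```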
@@ -3445,6 +3524,265 @@ static const uint64_t iq2xs_grid[512] = {
|
|
|
3445
3524
|
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
|
3446
3525
|
};
|
|
3447
3526
|
|
|
3527
|
+
static const uint64_t iq2s_grid[1024] = {
|
|
3528
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
|
3529
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
|
3530
|
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
|
3531
|
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
|
3532
|
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
|
3533
|
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
|
3534
|
+
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
|
3535
|
+
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
|
3536
|
+
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
|
3537
|
+
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
|
3538
|
+
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
|
3539
|
+
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
|
3540
|
+
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
|
3541
|
+
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
|
3542
|
+
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
|
3543
|
+
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
|
3544
|
+
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
|
3545
|
+
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
|
3546
|
+
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
|
3547
|
+
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
|
3548
|
+
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
|
3549
|
+
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
|
3550
|
+
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
|
3551
|
+
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
|
3552
|
+
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
|
3553
|
+
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
|
3554
|
+
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
|
3555
|
+
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
|
3556
|
+
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
|
3557
|
+
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
|
3558
|
+
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
|
3559
|
+
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
|
3560
|
+
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
|
3561
|
+
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
|
3562
|
+
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
|
3563
|
+
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
|
3564
|
+
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
|
3565
|
+
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
|
3566
|
+
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
|
3567
|
+
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
|
3568
|
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
|
3569
|
+
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
|
3570
|
+
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
|
3571
|
+
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
|
3572
|
+
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
|
3573
|
+
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
|
3574
|
+
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
|
3575
|
+
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
|
3576
|
+
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
|
3577
|
+
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
|
3578
|
+
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
|
3579
|
+
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
|
3580
|
+
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
|
3581
|
+
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
|
3582
|
+
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
|
3583
|
+
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
|
3584
|
+
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
|
3585
|
+
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
|
3586
|
+
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
|
3587
|
+
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
|
3588
|
+
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
|
3589
|
+
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
|
3590
|
+
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
|
3591
|
+
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
|
3592
|
+
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
|
3593
|
+
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
|
3594
|
+
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
|
3595
|
+
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
|
3596
|
+
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
|
3597
|
+
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
|
3598
|
+
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
|
3599
|
+
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
|
3600
|
+
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
|
3601
|
+
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
|
3602
|
+
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
|
3603
|
+
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
|
3604
|
+
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
|
3605
|
+
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
|
3606
|
+
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
|
3607
|
+
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
|
3608
|
+
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
|
3609
|
+
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
|
3610
|
+
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
|
3611
|
+
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
|
3612
|
+
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
|
3613
|
+
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
|
3614
|
+
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
|
3615
|
+
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
|
3616
|
+
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
|
3617
|
+
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
|
3618
|
+
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
|
3619
|
+
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
|
3620
|
+
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
|
3621
|
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
|
3622
|
+
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
|
3623
|
+
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
|
3624
|
+
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
|
3625
|
+
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
|
3626
|
+
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
|
3627
|
+
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
|
3628
|
+
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
|
3629
|
+
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
|
3630
|
+
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
|
3631
|
+
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
|
3632
|
+
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
|
3633
|
+
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
|
3634
|
+
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
|
3635
|
+
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
|
3636
|
+
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
|
3637
|
+
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
|
3638
|
+
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
|
3639
|
+
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
|
3640
|
+
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
|
3641
|
+
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
|
3642
|
+
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
|
3643
|
+
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
|
3644
|
+
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
|
3645
|
+
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
|
3646
|
+
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
|
3647
|
+
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
|
3648
|
+
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
|
3649
|
+
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
|
3650
|
+
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
|
3651
|
+
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
|
3652
|
+
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
|
3653
|
+
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
|
3654
|
+
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
|
3655
|
+
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
|
3656
|
+
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
|
3657
|
+
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
|
3658
|
+
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
|
3659
|
+
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
|
3660
|
+
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
|
3661
|
+
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
|
3662
|
+
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
|
3663
|
+
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
|
3664
|
+
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
|
3665
|
+
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
|
3666
|
+
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
|
3667
|
+
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
|
3668
|
+
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
|
3669
|
+
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
|
3670
|
+
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
|
3671
|
+
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
|
3672
|
+
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
|
3673
|
+
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
|
3674
|
+
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
|
3675
|
+
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
|
3676
|
+
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
|
3677
|
+
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
|
3678
|
+
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
|
3679
|
+
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
|
3680
|
+
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
|
3681
|
+
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
|
3682
|
+
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
|
3683
|
+
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
|
3684
|
+
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
|
3685
|
+
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
|
3686
|
+
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
|
3687
|
+
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
|
3688
|
+
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
|
3689
|
+
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
|
3690
|
+
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
|
3691
|
+
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
|
3692
|
+
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
|
3693
|
+
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
|
3694
|
+
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
|
3695
|
+
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
|
3696
|
+
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
|
3697
|
+
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
|
3698
|
+
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
|
3699
|
+
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
|
3700
|
+
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
|
3701
|
+
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
|
3702
|
+
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
|
3703
|
+
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
|
3704
|
+
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
|
3705
|
+
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
|
3706
|
+
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
|
3707
|
+
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
|
3708
|
+
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
|
3709
|
+
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
|
3710
|
+
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
|
3711
|
+
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
|
3712
|
+
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
|
3713
|
+
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
|
3714
|
+
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
|
3715
|
+
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
|
3716
|
+
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
|
3717
|
+
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
|
3718
|
+
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
|
3719
|
+
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
|
3720
|
+
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
|
3721
|
+
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
|
3722
|
+
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
|
3723
|
+
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
|
3724
|
+
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
|
3725
|
+
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
|
3726
|
+
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
|
3727
|
+
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
|
3728
|
+
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
|
3729
|
+
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
|
3730
|
+
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
|
3731
|
+
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
|
3732
|
+
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
|
3733
|
+
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
|
3734
|
+
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
|
3735
|
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
|
3736
|
+
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
|
3737
|
+
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
|
3738
|
+
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
|
3739
|
+
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
|
3740
|
+
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
|
3741
|
+
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
|
3742
|
+
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
|
3743
|
+
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
|
3744
|
+
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
|
3745
|
+
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
|
3746
|
+
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
|
3747
|
+
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
|
3748
|
+
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
|
3749
|
+
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
|
3750
|
+
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
|
3751
|
+
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
|
3752
|
+
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
|
3753
|
+
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
|
3754
|
+
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
|
3755
|
+
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
|
3756
|
+
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
|
3757
|
+
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
|
3758
|
+
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
|
3759
|
+
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
|
3760
|
+
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
|
3761
|
+
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
|
3762
|
+
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
|
3763
|
+
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
|
3764
|
+
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
|
3765
|
+
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
|
3766
|
+
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
|
3767
|
+
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
|
3768
|
+
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
|
3769
|
+
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
|
3770
|
+
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
|
3771
|
+
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
|
3772
|
+
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
|
3773
|
+
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
|
3774
|
+
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
|
3775
|
+
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
|
3776
|
+
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
|
3777
|
+
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
|
3778
|
+
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
|
3779
|
+
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
|
3780
|
+
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
|
3781
|
+
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
|
3782
|
+
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
|
3783
|
+
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
|
3784
|
+
};
|
|
3785
|
+
|
|
3448
3786
|
static const uint32_t iq3xxs_grid[256] = {
|
|
3449
3787
|
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
|
3450
3788
|
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
|
@@ -3480,6 +3818,206 @@ static const uint32_t iq3xxs_grid[256] = {
|
|
|
3480
3818
|
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
|
3481
3819
|
};
|
|
3482
3820
|
|
|
3821
|
+
static const uint32_t iq3xs_grid[512] = {
|
|
3822
|
+
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
|
3823
|
+
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
|
3824
|
+
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
|
3825
|
+
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
|
3826
|
+
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
|
3827
|
+
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
|
3828
|
+
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
|
3829
|
+
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
|
3830
|
+
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
|
3831
|
+
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
|
3832
|
+
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
|
3833
|
+
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
|
3834
|
+
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
|
3835
|
+
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
|
3836
|
+
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
|
3837
|
+
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
|
3838
|
+
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
|
3839
|
+
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
|
3840
|
+
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
|
3841
|
+
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
|
3842
|
+
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
|
3843
|
+
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
|
3844
|
+
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
|
3845
|
+
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
|
3846
|
+
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
|
3847
|
+
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
|
3848
|
+
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
|
3849
|
+
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
|
3850
|
+
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
|
3851
|
+
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
|
3852
|
+
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
|
3853
|
+
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
|
3854
|
+
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
|
3855
|
+
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
|
3856
|
+
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
|
3857
|
+
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
|
3858
|
+
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
|
3859
|
+
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
|
3860
|
+
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
|
3861
|
+
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
|
3862
|
+
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
|
3863
|
+
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
|
3864
|
+
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
|
3865
|
+
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
|
3866
|
+
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
|
3867
|
+
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
|
3868
|
+
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
|
3869
|
+
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
|
3870
|
+
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
|
3871
|
+
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
|
3872
|
+
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
|
3873
|
+
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
|
3874
|
+
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
|
3875
|
+
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
|
3876
|
+
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
|
3877
|
+
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
|
3878
|
+
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
|
3879
|
+
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
|
3880
|
+
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
|
3881
|
+
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
|
3882
|
+
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
|
3883
|
+
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
|
3884
|
+
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
|
3885
|
+
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
|
3886
|
+
};
|
|
3887
|
+
|
|
3888
|
+
#define NGRID_IQ2XXS 512
|
|
3889
|
+
static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
|
|
3890
|
+
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
|
3891
|
+
0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
|
|
3892
|
+
0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
|
|
3893
|
+
0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
|
|
3894
|
+
0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
|
|
3895
|
+
0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
|
|
3896
|
+
0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
|
|
3897
|
+
0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
|
|
3898
|
+
0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
|
|
3899
|
+
0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
|
|
3900
|
+
0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
|
|
3901
|
+
0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
|
|
3902
|
+
0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
|
|
3903
|
+
0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
|
|
3904
|
+
0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
|
|
3905
|
+
0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
|
|
3906
|
+
0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
|
|
3907
|
+
0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
|
|
3908
|
+
0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
|
|
3909
|
+
0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
|
|
3910
|
+
0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
|
|
3911
|
+
0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
|
|
3912
|
+
0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
|
|
3913
|
+
0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
|
|
3914
|
+
0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
|
|
3915
|
+
0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
|
|
3916
|
+
0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
|
|
3917
|
+
0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
|
|
3918
|
+
0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
|
|
3919
|
+
0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
|
|
3920
|
+
0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
|
|
3921
|
+
0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
|
|
3922
|
+
0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
|
|
3923
|
+
0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
|
|
3924
|
+
0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
|
|
3925
|
+
0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
|
|
3926
|
+
0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
|
|
3927
|
+
0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
|
|
3928
|
+
0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
|
|
3929
|
+
0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
|
|
3930
|
+
0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
|
|
3931
|
+
0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
|
|
3932
|
+
0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
|
|
3933
|
+
0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
|
|
3934
|
+
0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
|
|
3935
|
+
0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
|
|
3936
|
+
0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
|
|
3937
|
+
0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
|
|
3938
|
+
0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
|
|
3939
|
+
0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
|
|
3940
|
+
0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
|
|
3941
|
+
0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
|
|
3942
|
+
0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
|
|
3943
|
+
0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
|
|
3944
|
+
0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
|
|
3945
|
+
0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
|
|
3946
|
+
0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
|
|
3947
|
+
0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
|
|
3948
|
+
0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
|
|
3949
|
+
0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
|
|
3950
|
+
0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
|
|
3951
|
+
0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
|
|
3952
|
+
0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
|
|
3953
|
+
0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
|
|
3954
|
+
0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
|
|
3955
|
+
0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
|
|
3956
|
+
0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
|
|
3957
|
+
0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
|
|
3958
|
+
0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
|
|
3959
|
+
0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
|
|
3960
|
+
0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
|
|
3961
|
+
0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
|
|
3962
|
+
0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
|
|
3963
|
+
0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
|
|
3964
|
+
0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
|
|
3965
|
+
0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
|
|
3966
|
+
0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
|
|
3967
|
+
0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
|
|
3968
|
+
0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
|
|
3969
|
+
0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
|
|
3970
|
+
0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
|
|
3971
|
+
0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
|
|
3972
|
+
0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
|
|
3973
|
+
0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
|
|
3974
|
+
0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
|
|
3975
|
+
0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
|
|
3976
|
+
0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
|
|
3977
|
+
0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
|
|
3978
|
+
0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
|
|
3979
|
+
0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
|
|
3980
|
+
0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
|
|
3981
|
+
0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
|
|
3982
|
+
0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
|
|
3983
|
+
0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
|
|
3984
|
+
0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
|
|
3985
|
+
0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
|
|
3986
|
+
0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
|
|
3987
|
+
0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
|
|
3988
|
+
0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
|
|
3989
|
+
0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
|
|
3990
|
+
0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
|
|
3991
|
+
0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
|
|
3992
|
+
0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
|
|
3993
|
+
0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
|
|
3994
|
+
0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
|
|
3995
|
+
0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
|
|
3996
|
+
0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
|
|
3997
|
+
0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
|
|
3998
|
+
0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
|
|
3999
|
+
0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
|
|
4000
|
+
0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
|
|
4001
|
+
0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
|
|
4002
|
+
0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
|
|
4003
|
+
0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
|
|
4004
|
+
0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
|
|
4005
|
+
0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
|
|
4006
|
+
0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
|
|
4007
|
+
0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
|
|
4008
|
+
0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
|
|
4009
|
+
0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
|
|
4010
|
+
0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
|
|
4011
|
+
0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
|
|
4012
|
+
0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
|
|
4013
|
+
0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
|
|
4014
|
+
0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
|
|
4015
|
+
0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
|
|
4016
|
+
0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
|
|
4017
|
+
0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
|
|
4018
|
+
|
|
4019
|
+
};
|
|
4020
|
+
|
|
3483
4021
|
static const uint8_t ksigns_iq2xs[128] = {
|
|
3484
4022
|
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
|
3485
4023
|
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
|
@@ -3546,6 +4084,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
     }
 }
 
+// ====================== 2.5625 bpw (de)-quantization
+
+void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    float db[2];
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint8_t * signs = qs + QK_K/8;
+
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
+            db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
+            for (int l = 0; l < 4; ++l) {
+                const float dl = db[l/2];
+                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+            qs += 4;
+            signs += 4;
+        }
+    }
+}
+
 // ====================== 3.0625 bpw (de)-quantization
 
 void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
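In dequantize_row_iq2_s above, every group of 8 weights is recovered by indexing the 1024-entry iq2s_grid with a 10-bit value: the low 8 bits come from qs[l] and the top two bits are pulled out of qh[ib32]. The sketch below (not from the diff; qs/qh values invented) isolates just that index assembly.

```c
/* Hedged illustration of the iq2s_grid index assembly used above:
 * index = qs[l] | ((qh << (8 - 2*l)) & 0x300), giving a value in 0..1023. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t qs[4] = {0x7f, 0x01, 0xaa, 0x10};  // invented low bytes
    const uint8_t qh    = 0xb4;                      // packs two high bits per lookup

    for (int l = 0; l < 4; ++l) {
        // shifting by 8-2*l moves a different 2-bit field of qh into bits 8..9
        const uint16_t idx = (uint16_t)(qs[l] | ((qh << (8 - 2*l)) & 0x300));
        printf("l=%d -> grid index %u\n", l, idx);
    }
    return 0;
}
```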
@@ -3578,6 +4148,139 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
|
|
3578
4148
|
}
|
|
3579
4149
|
}
|
|
3580
4150
|
|
|
4151
|
+
// ====================== 3.3125 bpw (de)-quantization
|
|
4152
|
+
|
|
4153
|
+
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
|
|
4154
|
+
assert(k % QK_K == 0);
|
|
4155
|
+
const int nb = k / QK_K;
|
|
4156
|
+
|
|
4157
|
+
for (int i = 0; i < nb; i++) {
|
|
4158
|
+
|
|
4159
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
|
4160
|
+
const uint8_t * qs = x[i].qs;
|
|
4161
|
+
const uint8_t * qh = x[i].qh;
|
|
4162
|
+
const uint8_t * signs = x[i].signs;
|
|
4163
|
+
|
|
4164
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
4165
|
+
const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f;
|
|
4166
|
+
const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f;
|
|
4167
|
+
for (int l = 0; l < 4; ++l) {
|
|
4168
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
|
4169
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
|
4170
|
+
for (int j = 0; j < 4; ++j) {
|
|
4171
|
+
y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
|
4172
|
+
y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
|
4173
|
+
}
|
|
4174
|
+
y += 8;
|
|
4175
|
+
}
|
|
4176
|
+
qs += 8;
|
|
4177
|
+
signs += 4;
|
|
4178
|
+
for (int l = 0; l < 4; ++l) {
|
|
4179
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
|
|
4180
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
|
|
4181
|
+
for (int j = 0; j < 4; ++j) {
|
|
4182
|
+
y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
|
4183
|
+
y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
|
4184
|
+
}
|
|
4185
|
+
y += 8;
|
|
4186
|
+
}
|
|
4187
|
+
qh += 2;
|
|
4188
|
+
qs += 8;
|
|
4189
|
+
signs += 4;
|
|
4190
|
+
}
|
|
4191
|
+
}
|
|
4192
|
+
}
|
|
4193
|
+
|
|
4194
|
+
// ====================== 1.5625 bpw (de)-quantization
|
|
4195
|
+
|
|
4196
|
+
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
|
|
4197
|
+
assert(k % QK_K == 0);
|
|
4198
|
+
const int nb = k / QK_K;
|
|
4199
|
+
|
|
4200
|
+
float db[4];
|
|
4201
|
+
uint16_t idx[4];
|
|
4202
|
+
//const int8_t * grid[4];
|
|
4203
|
+
|
|
4204
|
+
for (int i = 0; i < nb; i++) {
|
|
4205
|
+
|
|
4206
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
|
4207
|
+
const uint8_t * sc = x[i].scales;
|
|
4208
|
+
const uint8_t * qs = x[i].qs;
|
|
4209
|
+
|
|
4210
|
+
for (int i8 = 0; i8 < QK_K/8; i8 += 4) {
|
|
4211
|
+
idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
|
|
4212
|
+
idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
|
|
4213
|
+
idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
|
|
4214
|
+
idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
|
|
4215
|
+
//grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
|
|
4216
|
+
//grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
|
|
4217
|
+
//grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
|
|
4218
|
+
//grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
|
|
4219
|
+
db[0] = d * (2*(sc[0] & 7) + 1);
|
|
4220
|
+
db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
|
|
4221
|
+
db[2] = d * (2*(sc[1] & 7) + 1);
|
|
4222
|
+
db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
|
|
4223
|
+
for (int l = 0; l < 4; ++l) {
|
|
4224
|
+
const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
|
|
4225
|
+
for (int j = 0; j < 8; ++j) {
|
|
4226
|
+
//y[j] = db[l] * grid[l][j];
|
|
4227
|
+
y[j] = db[l] * grid[j];
|
|
4228
|
+
}
|
|
4229
|
+
y += 8;
|
|
4230
|
+
}
|
|
4231
|
+
qs += 4;
|
|
4232
|
+
sc += 2;
|
|
4233
|
+
}
|
|
4234
|
+
}
|
|
4235
|
+
}
|
|
4236
|
+
|
|
4237
|
+
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
|
4238
|
+
|
|
4239
|
+
void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
|
|
4240
|
+
assert(k % QK4_NL == 0);
|
|
4241
|
+
const int nb = k / QK4_NL;
|
|
4242
|
+
|
|
4243
|
+
for (int i = 0; i < nb; i++) {
|
|
4244
|
+
|
|
4245
|
+
const uint8_t * qs = x[i].qs;
|
|
4246
|
+
|
|
4247
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
|
4248
|
+
for (int j = 0; j < QK4_NL/2; ++j) {
|
|
4249
|
+
y[j+ 0] = d * kvalues_iq4nl[qs[j] & 0xf];
|
|
4250
|
+
y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >> 4];
|
|
4251
|
+
}
|
|
4252
|
+
y += QK4_NL;
|
|
4253
|
+
qs += QK4_NL/2;
|
|
4254
|
+
}
|
|
4255
|
+
}
|
|
4256
|
+
|
|
4257
|
+
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
|
4258
|
+
assert(k % QK_K == 0);
|
|
4259
|
+
#if QK_K == 64
|
|
4260
|
+
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
|
4261
|
+
#else
|
|
4262
|
+
const int nb = k / QK_K;
|
|
4263
|
+
|
|
4264
|
+
for (int i = 0; i < nb; i++) {
|
|
4265
|
+
|
|
4266
|
+
const uint8_t * qs = x[i].qs;
|
|
4267
|
+
|
|
4268
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
|
4269
|
+
|
|
4270
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
4271
|
+
const int ls = ((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4);
|
|
4272
|
+
const float dl = d * (ls - 32);
|
|
4273
|
+
for (int j = 0; j < 16; ++j) {
|
|
4274
|
+
y[j+ 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
|
|
4275
|
+
y[j+16] = dl * kvalues_iq4nl[qs[j] >> 4];
|
|
4276
|
+
}
|
|
4277
|
+
y += 32;
|
|
4278
|
+
qs += 16;
|
|
4279
|
+
}
|
|
4280
|
+
}
|
|
4281
|
+
#endif
|
|
4282
|
+
}
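Note (not part of the upstream diff): dequantize_row_iq4_xs rebuilds a 6-bit scale per 32-weight group from two packed fields and re-centres it to -32..31. A sketch, assuming scales_h is wide enough to hold two bits per group as the expression above requires; the helper name is mine.

```c
#include <stdint.h>

// Rebuild the signed per-group scale used above: four low bits from scales_l,
// two high bits from scales_h, then subtract 32 so the range is -32..31.
static float iq4xs_group_scale(const uint8_t *scales_l, uint16_t scales_h,
                               int ib, float d) {
    const int lo = (scales_l[ib/2] >> 4*(ib%2)) & 0xf;
    const int hi = (scales_h >> 2*ib) & 3;
    return d * ((lo | (hi << 4)) - 32);
}
```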
|
|
4283
|
+
|
|
3581
4284
|
//===================================== Q8_K ==============================================
|
|
3582
4285
|
|
|
3583
4286
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
|
@@ -3848,15 +4551,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
|
3848
4551
|
|
|
3849
4552
|
const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
|
|
3850
4553
|
|
|
3851
|
-
__m128i
|
|
3852
|
-
__m128i
|
|
3853
|
-
|
|
3854
|
-
const __m128i i32_0 = mul_sum_i8_pairs(
|
|
4554
|
+
__m128i bx_0 = _mm_and_si128(lowMask, tmp);
|
|
4555
|
+
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
|
|
4556
|
+
bx_0 = _mm_sub_epi8(bx_0, off);
|
|
4557
|
+
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
|
|
3855
4558
|
|
|
3856
|
-
|
|
3857
|
-
|
|
3858
|
-
|
|
3859
|
-
const __m128i i32_1 = mul_sum_i8_pairs(
|
|
4559
|
+
bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
|
|
4560
|
+
by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
|
|
4561
|
+
bx_0 = _mm_sub_epi8(bx_0, off);
|
|
4562
|
+
const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
|
|
3860
4563
|
|
|
3861
4564
|
// Convert int32_t to float
|
|
3862
4565
|
__m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
|
|
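Note (not part of the upstream diff): the renamed bx_0/by_0 registers in this hunk hold the usual Q4_0 operands. A scalar sketch of what one bx_0 byte lane contains, assuming (as is conventional for Q4_0 but not shown in this hunk) that lowMask is 0x0f per byte and off is 8:

```c
#include <stdint.h>

// Each Q4_0 source byte packs two 4-bit quants; both are shifted to be
// symmetric around zero.  The first pass masks the low nibble, the second
// pass reaches the high nibble via a right shift by 4 (the srli in the hunk).
static void q4_0_unpack_byte(uint8_t q, int8_t *lo, int8_t *hi) {
    *lo = (int8_t)(q & 0x0f) - 8;
    *hi = (int8_t)(q >> 4)   - 8;
}
```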
@@ -4442,21 +5145,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
|
4442
5145
|
/* Compute combined scale for the block */
|
|
4443
5146
|
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
|
4444
5147
|
|
|
4445
|
-
__m256i
|
|
5148
|
+
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
|
|
4446
5149
|
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
|
4447
5150
|
__m128i bxhil = _mm256_castsi256_si128(bxhi);
|
|
4448
5151
|
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
|
|
4449
5152
|
bxhil = _mm_andnot_si128(bxhil, mask);
|
|
4450
5153
|
bxhih = _mm_andnot_si128(bxhih, mask);
|
|
4451
|
-
__m128i bxl = _mm256_castsi256_si128(
|
|
4452
|
-
__m128i bxh = _mm256_extractf128_si256(
|
|
5154
|
+
__m128i bxl = _mm256_castsi256_si128(bx_0);
|
|
5155
|
+
__m128i bxh = _mm256_extractf128_si256(bx_0, 1);
|
|
4453
5156
|
bxl = _mm_or_si128(bxl, bxhil);
|
|
4454
5157
|
bxh = _mm_or_si128(bxh, bxhih);
|
|
4455
|
-
|
|
5158
|
+
bx_0 = MM256_SET_M128I(bxh, bxl);
|
|
4456
5159
|
|
|
4457
|
-
const __m256i
|
|
5160
|
+
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
|
4458
5161
|
|
|
4459
|
-
const __m256 q = mul_sum_i8_pairs_float(
|
|
5162
|
+
const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
|
|
4460
5163
|
|
|
4461
5164
|
/* Multiply q with scale and accumulate */
|
|
4462
5165
|
acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
|
|
@@ -4749,22 +5452,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
|
4749
5452
|
|
|
4750
5453
|
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
|
4751
5454
|
|
|
4752
|
-
__m256i
|
|
5455
|
+
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
|
|
4753
5456
|
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
|
4754
5457
|
__m128i bxhil = _mm256_castsi256_si128(bxhi);
|
|
4755
5458
|
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
|
|
4756
5459
|
bxhil = _mm_and_si128(bxhil, mask);
|
|
4757
5460
|
bxhih = _mm_and_si128(bxhih, mask);
|
|
4758
|
-
__m128i bxl = _mm256_castsi256_si128(
|
|
4759
|
-
__m128i bxh = _mm256_extractf128_si256(
|
|
5461
|
+
__m128i bxl = _mm256_castsi256_si128(bx_0);
|
|
5462
|
+
__m128i bxh = _mm256_extractf128_si256(bx_0, 1);
|
|
4760
5463
|
bxl = _mm_or_si128(bxl, bxhil);
|
|
4761
5464
|
bxh = _mm_or_si128(bxh, bxhih);
|
|
4762
|
-
|
|
5465
|
+
bx_0 = MM256_SET_M128I(bxh, bxl);
|
|
4763
5466
|
|
|
4764
5467
|
const __m256 dy = _mm256_set1_ps(y[i].d);
|
|
4765
|
-
const __m256i
|
|
5468
|
+
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
|
4766
5469
|
|
|
4767
|
-
const __m256 q = mul_sum_us8_pairs_float(
|
|
5470
|
+
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
|
|
4768
5471
|
|
|
4769
5472
|
acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
|
|
4770
5473
|
}
|
|
@@ -4993,10 +5696,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
|
4993
5696
|
|
|
4994
5697
|
for (int i = 0; i < nb; i++) {
|
|
4995
5698
|
// load elements
|
|
4996
|
-
vint8m1_t
|
|
4997
|
-
vint8m1_t
|
|
5699
|
+
vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
|
|
5700
|
+
vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
|
|
4998
5701
|
|
|
4999
|
-
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(
|
|
5702
|
+
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
|
|
5000
5703
|
|
|
5001
5704
|
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
|
|
5002
5705
|
vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
|
|
@@ -5433,8 +6136,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5433
6136
|
|
|
5434
6137
|
for (int i = 0; i < nb; ++i) {
|
|
5435
6138
|
|
|
5436
|
-
const float d
|
|
5437
|
-
const float dmin = -y[i].d * (
|
|
6139
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
6140
|
+
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
|
5438
6141
|
|
|
5439
6142
|
const uint8_t * restrict q2 = x[i].qs;
|
|
5440
6143
|
const int8_t * restrict q8 = y[i].qs;
|
|
@@ -5583,8 +6286,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5583
6286
|
|
|
5584
6287
|
for (int i = 0; i < nb; ++i) {
|
|
5585
6288
|
|
|
5586
|
-
const float d
|
|
5587
|
-
const float dmin = -y[i].d * (
|
|
6289
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
6290
|
+
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
|
5588
6291
|
|
|
5589
6292
|
const uint8_t * restrict q2 = x[i].qs;
|
|
5590
6293
|
const int8_t * restrict q8 = y[i].qs;
|
|
@@ -5636,7 +6339,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5636
6339
|
|
|
5637
6340
|
float sumf = 0;
|
|
5638
6341
|
|
|
5639
|
-
int isum[
|
|
6342
|
+
int isum[QK_K/16];
|
|
5640
6343
|
|
|
5641
6344
|
for (int i = 0; i < nb; ++i) {
|
|
5642
6345
|
|
|
@@ -5652,14 +6355,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5652
6355
|
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
5653
6356
|
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
|
5654
6357
|
|
|
5655
|
-
isum
|
|
6358
|
+
memset(isum, 0, (QK_K/16)*sizeof(int));
|
|
5656
6359
|
for (int l = 0; l < 16; ++l) {
|
|
5657
6360
|
isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
|
|
5658
6361
|
isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
|
|
5659
6362
|
isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
|
|
5660
6363
|
isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
|
|
5661
6364
|
}
|
|
5662
|
-
for (int l = 0; l <
|
|
6365
|
+
for (int l = 0; l < QK_K/16; ++l) {
|
|
5663
6366
|
isum[l] *= (sc[l] & 0xF);
|
|
5664
6367
|
}
|
|
5665
6368
|
sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
|
|
@@ -6237,7 +6940,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6237
6940
|
|
|
6238
6941
|
int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
|
|
6239
6942
|
|
|
6240
|
-
const float d = y[i].d * (
|
|
6943
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
6241
6944
|
|
|
6242
6945
|
const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
|
|
6243
6946
|
q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
|
|
@@ -6439,7 +7142,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6439
7142
|
|
|
6440
7143
|
int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
|
|
6441
7144
|
|
|
6442
|
-
const float d = y[i].d * (
|
|
7145
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
6443
7146
|
|
|
6444
7147
|
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
|
6445
7148
|
|
|
@@ -6942,9 +7645,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6942
7645
|
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
6943
7646
|
|
|
6944
7647
|
const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
|
|
6945
|
-
sum_mins += y[i].d * (
|
|
7648
|
+
sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;
|
|
6946
7649
|
|
|
6947
|
-
const float d = y[i].d * (
|
|
7650
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
|
|
6948
7651
|
|
|
6949
7652
|
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
|
|
6950
7653
|
|
|
@@ -7602,7 +8305,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
7602
8305
|
|
|
7603
8306
|
for (int i = 0; i < nb; ++i) {
|
|
7604
8307
|
|
|
7605
|
-
const float d = y[i].d * (
|
|
8308
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
7606
8309
|
const int8_t * sc = x[i].scales;
|
|
7607
8310
|
|
|
7608
8311
|
const uint8_t * restrict q5 = x[i].qs;
|
|
@@ -7744,7 +8447,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
7744
8447
|
|
|
7745
8448
|
for (int i = 0; i < nb; ++i) {
|
|
7746
8449
|
|
|
7747
|
-
const float d = y[i].d * (
|
|
8450
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
7748
8451
|
const int8_t * sc = x[i].scales;
|
|
7749
8452
|
|
|
7750
8453
|
const uint8_t * restrict q5 = x[i].qs;
|
|
@@ -8312,7 +9015,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
8312
9015
|
|
|
8313
9016
|
for (int i = 0; i < nb; ++i) {
|
|
8314
9017
|
|
|
8315
|
-
const float d_all = (
|
|
9018
|
+
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
|
8316
9019
|
|
|
8317
9020
|
const uint8_t * restrict q6 = x[i].ql;
|
|
8318
9021
|
const uint8_t * restrict qh = x[i].qh;
|
|
@@ -8483,7 +9186,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
8483
9186
|
|
|
8484
9187
|
for (int i = 0; i < nb; ++i) {
|
|
8485
9188
|
|
|
8486
|
-
const float d_all = (
|
|
9189
|
+
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
|
8487
9190
|
|
|
8488
9191
|
const uint8_t * restrict q6 = x[i].ql;
|
|
8489
9192
|
const uint8_t * restrict qh = x[i].qh;
|
|
@@ -8585,6 +9288,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
8585
9288
|
|
|
8586
9289
|
#endif
|
|
8587
9290
|
|
|
9291
|
+
#if defined (__AVX2__) || defined (__ARM_NEON)
|
|
8588
9292
|
static const int8_t keven_signs_q2xs[1024] = {
|
|
8589
9293
|
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
|
8590
9294
|
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
|
@@ -8619,6 +9323,7 @@ static const int8_t keven_signs_q2xs[1024] = {
|
|
|
8619
9323
|
1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
|
|
8620
9324
|
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
8621
9325
|
};
|
|
9326
|
+
#endif
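Note (not part of the upstream diff): keven_signs_q2xs, now compiled only for AVX2/NEON, expands a 7-bit sign index into eight ±1 values, the eighth being a parity sign that keeps the count of -1 entries even. A sketch that reproduces the rows listed above; the function name is mine.

```c
#include <stdint.h>

// Reconstruct one 8-sign row of keven_signs_q2xs from its 7-bit index.
// Bits 0..6 give the first seven signs; the eighth sign is the parity of
// those bits, so every row contains an even number of -1 values.
static void keven_signs_row(unsigned idx, int8_t out[8]) {
    unsigned parity = 0;
    for (int j = 0; j < 7; ++j) {
        const unsigned bit = (idx >> j) & 1;
        out[j] = bit ? -1 : 1;
        parity ^= bit;
    }
    out[7] = parity ? -1 : 1;
}
```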
|
|
8622
9327
|
|
|
8623
9328
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
|
8624
9329
|
assert(n % QK_K == 0);
|
|
@@ -8816,15 +9521,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
|
8816
9521
|
|
|
8817
9522
|
#elif defined(__AVX2__)
|
|
8818
9523
|
|
|
8819
|
-
const __m128i m4 = _mm_set1_epi8(0xf);
|
|
8820
|
-
const __m128i m1 = _mm_set1_epi8(1);
|
|
8821
|
-
const __m256i m511 = _mm256_set1_epi16(511);
|
|
8822
9524
|
const __m256i mone = _mm256_set1_epi8(1);
|
|
8823
|
-
|
|
8824
|
-
static const uint8_t k_bit_helper[32] = {
|
|
8825
|
-
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
8826
|
-
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
8827
|
-
};
|
|
8828
9525
|
static const char block_sign_shuffle_mask_1[32] = {
|
|
8829
9526
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
|
8830
9527
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
|
@@ -8838,11 +9535,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
|
8838
9535
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
|
8839
9536
|
};
|
|
8840
9537
|
|
|
8841
|
-
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
|
8842
9538
|
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
|
|
8843
9539
|
const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
|
|
8844
9540
|
const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
|
|
8845
9541
|
|
|
9542
|
+
#if QK_K == 64
|
|
9543
|
+
static const uint8_t k_bit_helper[16] = {
|
|
9544
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
9545
|
+
};
|
|
9546
|
+
const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
|
|
9547
|
+
const __m128i m511 = _mm_set1_epi16(511);
|
|
9548
|
+
typedef union {
|
|
9549
|
+
__m128i vec_index;
|
|
9550
|
+
uint16_t index[8];
|
|
9551
|
+
} index_t;
|
|
9552
|
+
|
|
9553
|
+
index_t idx;
|
|
9554
|
+
__m256 accumf = _mm256_setzero_ps();
|
|
9555
|
+
for (int i = 0; i < nb; ++i) {
|
|
9556
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
9557
|
+
const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
|
|
9558
|
+
idx.vec_index = _mm_and_si128(q2_data, m511);
|
|
9559
|
+
|
|
9560
|
+
const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
|
|
9561
|
+
const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
|
|
9562
|
+
const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
|
|
9563
|
+
|
|
9564
|
+
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
|
9565
|
+
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
|
|
9566
|
+
const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
|
|
9567
|
+
|
|
9568
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
|
9569
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
|
|
9570
|
+
|
|
9571
|
+
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
|
|
9572
|
+
iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
|
|
9573
|
+
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
|
|
9574
|
+
iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
|
|
9575
|
+
|
|
9576
|
+
__m256i signs;
|
|
9577
|
+
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
|
|
9578
|
+
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
|
9579
|
+
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
|
|
9580
|
+
|
|
9581
|
+
signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
|
|
9582
|
+
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
|
9583
|
+
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
|
|
9584
|
+
|
|
9585
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
|
9586
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
|
9587
|
+
|
|
9588
|
+
const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
|
|
9589
|
+
const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
|
|
9590
|
+
|
|
9591
|
+
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
|
|
9592
|
+
|
|
9593
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
|
|
9594
|
+
|
|
9595
|
+
}
|
|
9596
|
+
|
|
9597
|
+
*s = 0.125f * hsum_float_8(accumf);
|
|
9598
|
+
#else
|
|
9599
|
+
|
|
9600
|
+
static const uint8_t k_bit_helper[32] = {
|
|
9601
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
9602
|
+
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
9603
|
+
};
|
|
9604
|
+
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
|
9605
|
+
const __m256i m511 = _mm256_set1_epi16(511);
|
|
9606
|
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
|
9607
|
+
const __m128i m1 = _mm_set1_epi8(1);
|
|
9608
|
+
|
|
8846
9609
|
uint64_t aux64;
|
|
8847
9610
|
|
|
8848
9611
|
// somewhat hacky, but gives a significant boost in performance
|
|
@@ -8931,6 +9694,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
|
8931
9694
|
}
|
|
8932
9695
|
|
|
8933
9696
|
*s = 0.125f * hsum_float_8(accumf);
|
|
9697
|
+
#endif
|
|
8934
9698
|
|
|
8935
9699
|
#else
|
|
8936
9700
|
|
|
@@ -8972,8 +9736,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
|
8972
9736
|
#endif
|
|
8973
9737
|
}
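Note (not part of the upstream diff): the new QK_K == 64 AVX2 path above splits each 16-bit IQ2_XS element with a 511 mask and a shift by 9. A scalar sketch of that split; the helper name is mine.

```c
#include <stdint.h>

// Split one 16-bit IQ2_XS element: the low 9 bits index iq2xs_grid, the high
// 7 bits select the sign pattern (expanded through the even-signs table).
static void iq2xs_split(uint16_t q2, uint16_t *grid_index, uint8_t *sign_bits) {
    *grid_index = q2 & 511;
    *sign_bits  = (uint8_t)(q2 >> 9);
}
```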
|
|
8974
9738
|
|
|
8975
|
-
|
|
8976
|
-
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
|
9739
|
+
void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
|
8977
9740
|
assert(n % QK_K == 0);
|
|
8978
9741
|
assert(nrc == 1);
|
|
8979
9742
|
UNUSED(nrc);
|
|
@@ -8981,75 +9744,279 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
|
8981
9744
|
UNUSED(by);
|
|
8982
9745
|
UNUSED(bs);
|
|
8983
9746
|
|
|
8984
|
-
const
|
|
8985
|
-
const block_q8_K
|
|
9747
|
+
const block_iq2_s * restrict x = vx;
|
|
9748
|
+
const block_q8_K * restrict y = vy;
|
|
8986
9749
|
|
|
8987
9750
|
const int nb = n / QK_K;
|
|
8988
9751
|
|
|
8989
9752
|
#if defined(__ARM_NEON)
|
|
8990
9753
|
|
|
8991
|
-
|
|
9754
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
|
9755
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
|
9756
|
+
};
|
|
8992
9757
|
|
|
8993
|
-
|
|
9758
|
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
|
8994
9759
|
|
|
8995
|
-
|
|
9760
|
+
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
|
|
9761
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
|
9762
|
+
const uint8x16_t m1 = vdupq_n_u8(1);
|
|
9763
|
+
const int32x4_t vzero = vdupq_n_s32(0);
|
|
9764
|
+
|
|
9765
|
+
uint8x16x2_t vs;
|
|
9766
|
+
ggml_int8x16x4_t q2s;
|
|
8996
9767
|
ggml_int8x16x4_t q8b;
|
|
8997
9768
|
|
|
8998
9769
|
float sumf = 0;
|
|
8999
9770
|
for (int i = 0; i < nb; ++i) {
|
|
9771
|
+
|
|
9000
9772
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
9001
|
-
|
|
9002
|
-
const uint8_t * restrict
|
|
9003
|
-
const
|
|
9004
|
-
|
|
9773
|
+
|
|
9774
|
+
const uint8_t * restrict qs = x[i].qs;
|
|
9775
|
+
const uint8_t * restrict qh = x[i].qh;
|
|
9776
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
|
9777
|
+
const int8_t * restrict q8 = y[i].qs;
|
|
9778
|
+
|
|
9779
|
+
int sumi1 = 0, sumi2 = 0;
|
|
9005
9780
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
9006
9781
|
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
|
9007
|
-
|
|
9008
|
-
|
|
9009
|
-
const
|
|
9010
|
-
|
|
9011
|
-
const
|
|
9012
|
-
|
|
9013
|
-
|
|
9014
|
-
|
|
9015
|
-
|
|
9016
|
-
|
|
9017
|
-
|
|
9018
|
-
|
|
9019
|
-
|
|
9020
|
-
|
|
9021
|
-
|
|
9022
|
-
|
|
9023
|
-
|
|
9024
|
-
|
|
9782
|
+
q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
|
|
9783
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
|
|
9784
|
+
q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
|
|
9785
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
|
|
9786
|
+
q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
|
|
9787
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
|
|
9788
|
+
q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
|
|
9789
|
+
vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
|
|
9790
|
+
qs += 8;
|
|
9791
|
+
|
|
9792
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
|
9793
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
|
9794
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
|
9795
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
|
9796
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
|
9797
|
+
|
|
9798
|
+
q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
|
|
9799
|
+
q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
|
|
9800
|
+
|
|
9801
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
|
9802
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
|
9803
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
|
9804
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
|
9805
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
|
9806
|
+
|
|
9807
|
+
signs += 4;
|
|
9808
|
+
|
|
9809
|
+
q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
|
|
9810
|
+
q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
|
|
9811
|
+
|
|
9812
|
+
const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
|
|
9813
|
+
const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
|
|
9814
|
+
const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
|
|
9815
|
+
const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
|
|
9816
|
+
|
|
9817
|
+
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
|
|
9818
|
+
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
|
|
9819
|
+
sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
|
|
9820
|
+
sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
|
|
9025
9821
|
}
|
|
9026
|
-
sumf += d*(
|
|
9822
|
+
sumf += d*(sumi1 + sumi2);
|
|
9027
9823
|
}
|
|
9028
|
-
|
|
9824
|
+
|
|
9825
|
+
*s = 0.125f * sumf;
|
|
9029
9826
|
|
|
9030
9827
|
#elif defined(__AVX2__)
|
|
9031
9828
|
|
|
9032
|
-
|
|
9829
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
|
9830
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
|
9831
|
+
};
|
|
9033
9832
|
|
|
9034
|
-
|
|
9833
|
+
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
|
9834
|
+
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
|
9835
|
+
};
|
|
9836
|
+
|
|
9837
|
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
|
9838
|
+
const __m128i m1 = _mm_set1_epi8(1);
|
|
9839
|
+
|
|
9840
|
+
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
|
9841
|
+
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
|
9842
|
+
|
|
9843
|
+
uint64_t aux64;
|
|
9035
9844
|
|
|
9036
9845
|
__m256 accumf = _mm256_setzero_ps();
|
|
9037
9846
|
for (int i = 0; i < nb; ++i) {
|
|
9038
9847
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
9039
|
-
const uint8_t * restrict
|
|
9040
|
-
const uint8_t * restrict
|
|
9848
|
+
const uint8_t * restrict qs = x[i].qs;
|
|
9849
|
+
const uint8_t * restrict qh = x[i].qh;
|
|
9850
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
|
9041
9851
|
const int8_t * restrict q8 = y[i].qs;
|
|
9852
|
+
|
|
9853
|
+
memcpy(&aux64, x[i].scales, 8);
|
|
9854
|
+
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
|
|
9855
|
+
const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
|
|
9856
|
+
|
|
9042
9857
|
__m256i sumi1 = _mm256_setzero_si256();
|
|
9043
9858
|
__m256i sumi2 = _mm256_setzero_si256();
|
|
9044
9859
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
9045
9860
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
9046
9861
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
9047
|
-
const __m256i q2_1 =
|
|
9048
|
-
|
|
9049
|
-
|
|
9050
|
-
|
|
9051
|
-
|
|
9052
|
-
|
|
9862
|
+
const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
|
|
9863
|
+
iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
|
|
9864
|
+
iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
|
|
9865
|
+
iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
|
|
9866
|
+
const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
|
|
9867
|
+
iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
|
|
9868
|
+
iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
|
|
9869
|
+
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
|
9870
|
+
qs += 8;
|
|
9871
|
+
|
|
9872
|
+
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
|
9873
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
|
9874
|
+
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
|
9875
|
+
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
|
9876
|
+
|
|
9877
|
+
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
|
9878
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
|
9879
|
+
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
|
9880
|
+
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
|
9881
|
+
|
|
9882
|
+
signs += 4;
|
|
9883
|
+
|
|
9884
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
|
|
9885
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
|
|
9886
|
+
|
|
9887
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
|
|
9888
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
|
|
9889
|
+
sumi1 = _mm256_add_epi32(sumi1, p1);
|
|
9890
|
+
sumi2 = _mm256_add_epi32(sumi2, p2);
|
|
9891
|
+
}
|
|
9892
|
+
|
|
9893
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
|
9894
|
+
|
|
9895
|
+
}
|
|
9896
|
+
|
|
9897
|
+
*s = 0.125f * hsum_float_8(accumf);
|
|
9898
|
+
|
|
9899
|
+
#else
|
|
9900
|
+
|
|
9901
|
+
float sumf = 0;
|
|
9902
|
+
for (int i = 0; i < nb; i++) {
|
|
9903
|
+
|
|
9904
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
9905
|
+
const int8_t * q8 = y[i].qs;
|
|
9906
|
+
const uint8_t * qs = x[i].qs;
|
|
9907
|
+
const uint8_t * qh = x[i].qh;
|
|
9908
|
+
const uint8_t * signs = qs + QK_K/8;
|
|
9909
|
+
|
|
9910
|
+
int bsum = 0;
|
|
9911
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
9912
|
+
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
|
9913
|
+
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
|
9914
|
+
int sumi1 = 0, sumi2 = 0;
|
|
9915
|
+
for (int l = 0; l < 2; ++l) {
|
|
9916
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
9917
|
+
for (int j = 0; j < 8; ++j) {
|
|
9918
|
+
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
9919
|
+
}
|
|
9920
|
+
q8 += 8;
|
|
9921
|
+
}
|
|
9922
|
+
for (int l = 2; l < 4; ++l) {
|
|
9923
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
9924
|
+
for (int j = 0; j < 8; ++j) {
|
|
9925
|
+
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
9926
|
+
}
|
|
9927
|
+
q8 += 8;
|
|
9928
|
+
}
|
|
9929
|
+
bsum += ls1 * sumi1 + ls2 * sumi2;
|
|
9930
|
+
qs += 4;
|
|
9931
|
+
signs += 4;
|
|
9932
|
+
}
|
|
9933
|
+
|
|
9934
|
+
sumf += d * bsum;
|
|
9935
|
+
}
|
|
9936
|
+
|
|
9937
|
+
*s = 0.125f * sumf;
|
|
9938
|
+
|
|
9939
|
+
#endif
|
|
9940
|
+
|
|
9941
|
+
}
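Note (not part of the upstream diff): all three ggml_vec_dot_iq2_s_q8_K paths above build a 10-bit iq2s_grid index from one qs byte plus two bits of the shared qh byte, and weight each 8-value group by an odd scale (2*s + 1) before the final 0.125f factor. A sketch of the index construction; the helper name is mine.

```c
#include <stdint.h>

// 10-bit iq2s_grid index for sub-block l (0..3) of one 32-weight group:
// 8 bits from qs[l], 2 bits from the shared qh byte moved into bits 8..9.
static uint16_t iq2s_grid_index(const uint8_t *qs, uint8_t qh_byte, int l) {
    return (uint16_t)(qs[l] | ((qh_byte << (8 - 2*l)) & 0x300));
}
```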
|
|
9942
|
+
|
|
9943
|
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
|
9944
|
+
assert(n % QK_K == 0);
|
|
9945
|
+
assert(nrc == 1);
|
|
9946
|
+
UNUSED(nrc);
|
|
9947
|
+
UNUSED(bx);
|
|
9948
|
+
UNUSED(by);
|
|
9949
|
+
UNUSED(bs);
|
|
9950
|
+
|
|
9951
|
+
const block_iq3_xxs * restrict x = vx;
|
|
9952
|
+
const block_q8_K * restrict y = vy;
|
|
9953
|
+
|
|
9954
|
+
const int nb = n / QK_K;
|
|
9955
|
+
|
|
9956
|
+
#if defined(__ARM_NEON)
|
|
9957
|
+
|
|
9958
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
|
9959
|
+
|
|
9960
|
+
uint32_t aux32[2];
|
|
9961
|
+
|
|
9962
|
+
ggml_int8x16x4_t q3s;
|
|
9963
|
+
ggml_int8x16x4_t q8b;
|
|
9964
|
+
|
|
9965
|
+
float sumf = 0;
|
|
9966
|
+
for (int i = 0; i < nb; ++i) {
|
|
9967
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
9968
|
+
const uint8_t * restrict q3 = x[i].qs;
|
|
9969
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
|
9970
|
+
const int8_t * restrict q8 = y[i].qs;
|
|
9971
|
+
float sumf1 = 0, sumf2 = 0;
|
|
9972
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
9973
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
|
9974
|
+
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
|
|
9975
|
+
const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
|
|
9976
|
+
const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
|
|
9977
|
+
const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
|
|
9978
|
+
const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
|
|
9979
|
+
q3 += 16;
|
|
9980
|
+
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
|
|
9981
|
+
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
|
|
9982
|
+
q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
|
|
9983
|
+
q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
|
|
9984
|
+
q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
|
|
9985
|
+
q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
|
|
9986
|
+
q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
|
|
9987
|
+
q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
|
|
9988
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
|
9989
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
|
9990
|
+
sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
|
|
9991
|
+
sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
|
|
9992
|
+
}
|
|
9993
|
+
sumf += d*(sumf1 + sumf2);
|
|
9994
|
+
}
|
|
9995
|
+
*s = 0.5f * sumf;
|
|
9996
|
+
|
|
9997
|
+
#elif defined(__AVX2__)
|
|
9998
|
+
|
|
9999
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
|
10000
|
+
|
|
10001
|
+
uint32_t aux32[2];
|
|
10002
|
+
|
|
10003
|
+
__m256 accumf = _mm256_setzero_ps();
|
|
10004
|
+
for (int i = 0; i < nb; ++i) {
|
|
10005
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
10006
|
+
const uint8_t * restrict q3 = x[i].qs;
|
|
10007
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
|
10008
|
+
const int8_t * restrict q8 = y[i].qs;
|
|
10009
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
|
10010
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
|
10011
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
10012
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
10013
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
10014
|
+
const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
|
10015
|
+
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
|
10016
|
+
q3 += 8;
|
|
10017
|
+
const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
|
10018
|
+
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
|
10019
|
+
q3 += 8;
|
|
9053
10020
|
memcpy(aux32, gas, 8); gas += 8;
|
|
9054
10021
|
const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
|
|
9055
10022
|
signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
|
|
@@ -9107,137 +10074,1449 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
|
9107
10074
|
#endif
|
|
9108
10075
|
}
|
|
9109
10076
|
|
|
9110
|
-
|
|
10077
|
+
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
10078
|
+
assert(n % QK_K == 0);
|
|
10079
|
+
assert(nrc == 1);
|
|
10080
|
+
UNUSED(nrc);
|
|
10081
|
+
UNUSED(bx);
|
|
10082
|
+
UNUSED(by);
|
|
10083
|
+
UNUSED(bs);
|
|
9111
10084
|
|
|
9112
|
-
|
|
9113
|
-
|
|
9114
|
-
int * map;
|
|
9115
|
-
uint16_t * neighbours;
|
|
9116
|
-
} iq2_entry_t;
|
|
10085
|
+
const block_iq3_s * restrict x = vx;
|
|
10086
|
+
const block_q8_K * restrict y = vy;
|
|
9117
10087
|
|
|
9118
|
-
|
|
9119
|
-
{NULL, NULL, NULL},
|
|
9120
|
-
{NULL, NULL, NULL},
|
|
9121
|
-
};
|
|
10088
|
+
const int nb = n / QK_K;
|
|
9122
10089
|
|
|
9123
|
-
|
|
9124
|
-
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
|
9125
|
-
return grid_size == 256 ? 0 : 1;
|
|
9126
|
-
}
|
|
10090
|
+
#if defined(__ARM_NEON)
|
|
9127
10091
|
|
|
9128
|
-
static
|
|
9129
|
-
|
|
9130
|
-
|
|
9131
|
-
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
|
|
9132
|
-
}
|
|
10092
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
|
10093
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
|
10094
|
+
};
|
|
9133
10095
|
|
|
9134
|
-
|
|
9135
|
-
const int gindex = iq2_data_index(grid_size);
|
|
9136
|
-
if (iq2_data[gindex].grid) {
|
|
9137
|
-
return;
|
|
9138
|
-
}
|
|
9139
|
-
static const uint16_t kgrid_256[256] = {
|
|
9140
|
-
0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
|
|
9141
|
-
100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
|
|
9142
|
-
1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
|
|
9143
|
-
1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
|
|
9144
|
-
2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
|
|
9145
|
-
4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
|
|
9146
|
-
5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
|
|
9147
|
-
8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
|
|
9148
|
-
10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
|
|
9149
|
-
16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
|
|
9150
|
-
17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
|
|
9151
|
-
20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
|
|
9152
|
-
22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
|
|
9153
|
-
25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
|
|
9154
|
-
33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
|
|
9155
|
-
37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
|
|
9156
|
-
};
|
|
9157
|
-
static const uint16_t kgrid_512[512] = {
|
|
9158
|
-
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
|
9159
|
-
73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
|
|
9160
|
-
260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
|
|
9161
|
-
352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
|
|
9162
|
-
640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
|
|
9163
|
-
1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
|
|
9164
|
-
1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
|
|
9165
|
-
2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
|
|
9166
|
-
2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
|
|
9167
|
-
4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
|
|
9168
|
-
4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
|
|
9169
|
-
5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
|
|
9170
|
-
5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
|
|
9171
|
-
8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
|
|
9172
|
-
8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
|
|
9173
|
-
10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
|
|
9174
|
-
16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
|
|
9175
|
-
16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
|
|
9176
|
-
16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
|
|
9177
|
-
17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
|
|
9178
|
-
18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
|
|
9179
|
-
20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
|
|
9180
|
-
21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
|
|
9181
|
-
22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
|
|
9182
|
-
24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
|
|
9183
|
-
32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
|
|
9184
|
-
33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
|
|
9185
|
-
33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
|
|
9186
|
-
35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
|
|
9187
|
-
37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
|
|
9188
|
-
40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
|
|
9189
|
-
42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
|
|
9190
|
-
};
|
|
9191
|
-
const int kmap_size = 43692;
|
|
9192
|
-
const int nwant = 2;
|
|
9193
|
-
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
|
|
9194
|
-
uint64_t * kgrid_q2xs;
|
|
9195
|
-
int * kmap_q2xs;
|
|
9196
|
-
uint16_t * kneighbors_q2xs;
|
|
10096
|
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
|
9197
10097
|
|
|
9198
|
-
|
|
9199
|
-
|
|
9200
|
-
|
|
9201
|
-
|
|
9202
|
-
|
|
9203
|
-
|
|
9204
|
-
|
|
10098
|
+
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
|
|
10099
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
|
10100
|
+
|
|
10101
|
+
uint8x16x2_t vs;
|
|
10102
|
+
ggml_int8x16x4_t q3s;
|
|
10103
|
+
ggml_int8x16x4_t q8b;
|
|
10104
|
+
|
|
10105
|
+
float sumf = 0;
|
|
10106
|
+
for (int i = 0; i < nb; ++i) {
|
|
10107
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
10108
|
+
const uint8_t * restrict qs = x[i].qs;
|
|
10109
|
+
const uint8_t * restrict qh = x[i].qh;
|
|
10110
|
+
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
|
10111
|
+
const int8_t * restrict q8 = y[i].qs;
|
|
10112
|
+
int sumi1 = 0, sumi2 = 0;
|
|
10113
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
10114
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
|
10115
|
+
const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
|
|
10116
|
+
iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
|
|
10117
|
+
const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
|
|
10118
|
+
iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
|
|
10119
|
+
const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
|
|
10120
|
+
iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
|
|
10121
|
+
const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
|
|
10122
|
+
iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
|
|
10123
|
+
qs += 16;
|
|
10124
|
+
|
|
10125
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
|
10126
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
|
10127
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
|
10128
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
|
10129
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
|
10130
|
+
|
|
10131
|
+
q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0]));
|
|
10132
|
+
q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
|
|
10133
|
+
|
|
10134
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
|
10135
|
+
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
|
10136
|
+
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
|
10137
|
+
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
|
10138
|
+
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
|
10139
|
+
|
|
10140
|
+
signs += 4;
|
|
10141
|
+
|
|
10142
|
+
q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0]));
|
|
10143
|
+
q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1]));
|
|
10144
|
+
|
|
10145
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
|
10146
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
|
10147
|
+
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
|
10148
|
+
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
|
9205
10149
|
}
|
|
10150
|
+
sumf += d*(sumi1 + sumi2);
|
|
9206
10151
|
}
|
|
9207
|
-
|
|
9208
|
-
|
|
9209
|
-
|
|
9210
|
-
|
|
9211
|
-
|
|
9212
|
-
|
|
9213
|
-
|
|
10152
|
+
*s = 0.25f * sumf;
|
|
10153
|
+
|
|
10154
|
+
#elif defined(__AVX2__)
|
|
10155
|
+
|
|
10156
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
|
10157
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
|
10158
|
+
};
|
|
10159
|
+
|
|
10160
|
+
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
|
10161
|
+
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
|
10162
|
+
};
|
|
10163
|
+
|
|
10164
|
+
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
|
10165
|
+
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
|
10166
|
+
|
|
10167
|
+
__m256 accumf = _mm256_setzero_ps();
|
|
10168
|
+
for (int i = 0; i < nb; ++i) {
|
|
10169
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
10170
|
+
const uint8_t * restrict qs = x[i].qs;
|
|
10171
|
+
const uint8_t * restrict qh = x[i].qh;
|
|
10172
|
+
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
|
10173
|
+
const int8_t * restrict q8 = y[i].qs;
|
|
10174
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
|
10175
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
|
10176
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
10177
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
10178
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
10179
|
+
const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)],
|
|
10180
|
+
iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)],
|
|
10181
|
+
iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)],
|
|
10182
|
+
iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)],
|
|
10183
|
+
iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)],
|
|
10184
|
+
iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)],
|
|
10185
|
+
iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)],
|
|
10186
|
+
iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
|
|
10187
|
+
qs += 8;
|
|
10188
|
+
const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)],
|
|
10189
|
+
iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)],
|
|
10190
|
+
iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)],
|
|
10191
|
+
iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)],
|
|
10192
|
+
iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)],
|
|
10193
|
+
iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)],
|
|
10194
|
+
iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)],
|
|
10195
|
+
iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]);
|
|
10196
|
+
qs += 8;
|
|
10197
|
+
|
|
10198
|
+
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
|
10199
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
|
10200
|
+
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
|
10201
|
+
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
|
10202
|
+
|
|
10203
|
+
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
|
10204
|
+
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
|
10205
|
+
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
|
10206
|
+
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
|
10207
|
+
|
|
10208
|
+
signs += 4;
|
|
10209
|
+
|
|
10210
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
|
10211
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
|
10212
|
+
const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
|
|
10213
|
+
const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
|
|
10214
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
|
|
10215
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
|
|
10216
|
+
sumi1 = _mm256_add_epi32(sumi1, p1);
|
|
10217
|
+
sumi2 = _mm256_add_epi32(sumi2, p2);
|
|
10218
|
+
}
|
|
10219
|
+
|
|
10220
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
|
10221
|
+
|
|
10222
|
+
}
|
|
10223
|
+
|
|
10224
|
+
*s = 0.25f * hsum_float_8(accumf);
|
|
10225
|
+
|
|
10226
|
+
#else
|
|
10227
|
+
|
|
10228
|
+
float sumf = 0.f;
|
|
10229
|
+
for (int i = 0; i < nb; ++i) {
|
|
10230
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
10231
|
+
const uint8_t * restrict qs = x[i].qs;
|
|
10232
|
+
const uint8_t * restrict qh = x[i].qh;
|
|
10233
|
+
const uint8_t * restrict signs = x[i].signs;
|
|
10234
|
+
const int8_t * restrict q8 = y[i].qs;
|
|
10235
|
+
int32_t bsum = 0;
|
|
10236
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
10237
|
+
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
|
10238
|
+
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
|
10239
|
+
int32_t sumi = 0;
|
|
10240
|
+
for (int l = 0; l < 4; ++l) {
|
|
10241
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
|
10242
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
|
10243
|
+
for (int j = 0; j < 4; ++j) {
|
|
10244
|
+
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
10245
|
+
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
10246
|
+
}
|
|
10247
|
+
q8 += 8;
|
|
10248
|
+
}
|
|
10249
|
+
qs += 8;
|
|
10250
|
+
signs += 4;
|
|
10251
|
+
bsum += sumi * ls1;
|
|
10252
|
+
sumi = 0;
|
|
10253
|
+
for (int l = 0; l < 4; ++l) {
|
|
10254
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
|
10255
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
|
10256
|
+
for (int j = 0; j < 4; ++j) {
|
|
10257
|
+
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
10258
|
+
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
10259
|
+
}
|
|
10260
|
+
q8 += 8;
|
|
10261
|
+
}
|
|
10262
|
+
qs += 8;
|
|
10263
|
+
signs += 4;
|
|
10264
|
+
bsum += sumi * ls2;
|
|
10265
|
+
}
|
|
10266
|
+
sumf += d * bsum;
|
|
10267
|
+
}
|
|
10268
|
+
*s = 0.25f * sumf;
|
|
10269
|
+
#endif
|
|
10270
|
+
}
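Note (not part of the upstream diff): ggml_vec_dot_iq3_s_q8_K above forms a 9-bit iq3xs_grid index per element, taking 8 bits from qs and one high bit from the shared qh byte. A sketch; the helper name is mine.

```c
#include <stdint.h>

// 9-bit iq3xs_grid index for element j (0..7) of one 32-weight group:
// bit j of the shared qh byte becomes index bit 8.
static uint16_t iq3s_grid_index(uint8_t qs_byte, uint8_t qh_byte, int j) {
    return (uint16_t)(qs_byte | ((qh_byte << (8 - j)) & 256));
}
```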
|
|
10271
|
+
|
|
10272
|
+
|
|
10273
|
+
#ifdef __AVX2__
|
|
10274
|
+
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
|
10275
|
+
const __m256i ax = _mm256_sign_epi8(x, x);
|
|
10276
|
+
const __m256i sy = _mm256_sign_epi8(y, x);
|
|
10277
|
+
return _mm256_maddubs_epi16(ax, sy);
|
|
10278
|
+
}
|
|
10279
|
+
#endif
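Note (not part of the upstream diff): mul_add_epi8 above moves the sign of x onto y (via _mm256_sign_epi8) because _mm256_maddubs_epi16 treats its first operand as unsigned; each 16-bit output lane is then the product-sum of one adjacent int8 pair. A scalar sketch of that per-lane result:

```c
#include <stdint.h>

// What one 16-bit lane of mul_add_epi8(x, y) holds: the signed dot product
// of the corresponding pair of int8 elements.
static int16_t mul_add_epi8_lane(const int8_t x[2], const int8_t y[2]) {
    return (int16_t)(x[0]*y[0] + x[1]*y[1]);
}
```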
|
|
10280
|
+
|
|
10281
|
+
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
10282
|
+
assert(n % QK_K == 0);
|
|
10283
|
+
assert(nrc == 1);
|
|
10284
|
+
UNUSED(nrc);
|
|
10285
|
+
UNUSED(bx);
|
|
10286
|
+
UNUSED(by);
|
|
10287
|
+
UNUSED(bs);
|
|
10288
|
+
|
|
10289
|
+
const block_iq1_s * restrict x = vx;
|
|
10290
|
+
const block_q8_K * restrict y = vy;
|
|
10291
|
+
|
|
10292
|
+
const int nb = n / QK_K;
|
|
10293
|
+
|
|
10294
|
+
// TODO: implement for QK_K = 64
|
|
10295
|
+
#if defined __ARM_NEON && QK_K == 256
|
|
10296
|
+
|
|
10297
|
+
const uint8x16_t m8 = vdupq_n_u8(0x08);
|
|
10298
|
+
    const uint8x16_t m7 = vdupq_n_u8(0x07);
    const uint8x16_t m1 = vdupq_n_u8(0x01);
    const int32x4_t vzero = vdupq_n_s32(0);

    uint16_t gindex[8];
    uint16x8x2_t vindex;
    int8x16x4_t q1b;
    ggml_int8x16x4_t q8b;
    uint16x8x4_t scales;
    int32x4x2_t sumi;
    int32x4x2_t dotq;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {

        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * sc = x[i].scales;

        sumi.val[0] = sumi.val[1] = vzero;

        for (int i128 = 0; i128 < QK_K/128; ++i128) {
            const uint8x16_t ql = vld1q_u8(qs); qs += 16;
            const uint8x8_t  tm1 = vld1_u8 (sc); sc += 8;
            const uint8x8_t  tm2 = vshr_n_u8(tm1, 4);
            const uint8x16_t qh  = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
            const uint8x16_t hbit = vandq_u8(qh, m8);
            vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
            vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
            const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
            scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
            scales.val[1] = vmovl_u8(vget_high_u8 (scales8));

            for (int l = 0; l < 2; ++l) {
                vst1q_u16(gindex+0, vindex.val[l]);
                q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1])));
                q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3])));
                q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5])));
                q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7])));
                q8b = ggml_vld1q_s8_x4(q8); q8 += 64;

                dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1]));
                dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3]));

                sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
                sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
            }
        }

        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1]));
    }

    *s = sumf;

    // TODO: implement for QK_K = 64
#elif defined __AVX2__ && QK_K == 256

    const __m128i m8 = _mm_set1_epi8(0x08);
    const __m128i m7 = _mm_set1_epi8(0x07);
    const __m128i m1 = _mm_set1_epi8(0x01);
    const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
    const __m128i shuffle_s[4] = {
        _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
        _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
        _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
        _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
    };

    uint64_t aux64;

    typedef union m256i_uint16 {
        __m256i reg;
        uint16_t s[16];
    } m256i_uint16_t;

    m256i_uint16_t v_gindex;

    __m256 accum = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {

        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * sc = x[i].scales;

        __m256i sumi = _mm256_setzero_si256();
        for (int i128 = 0; i128 < QK_K/128; ++i128) {
            const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
            memcpy(&aux64, sc, 8); sc += 8;
            const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
            const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
            v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
            const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);

            for (int i32 = 0; i32 < 4; ++i32) {
                const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
                const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
                                                      iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
                const __m256i dot = mul_add_epi8(q1b, q8b);
                const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
                const __m256i p   = _mm256_madd_epi16(s16, dot);
                sumi = _mm256_add_epi32(sumi, p);
            }

        }

        accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum);

    }

    *s = hsum_float_8(accum);

#else

    int db[4];
    uint16_t idx[4];

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {

        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * sc = x[i].scales;

        int sumi = 0;
        for (int i32 = 0; i32 < QK_K/32; ++i32) {
            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
            db[0] = (2*(sc[0] & 7) + 1);
            db[1] = (2*((sc[0] >> 4) & 7) + 1);
            db[2] = (2*(sc[1] & 7) + 1);
            db[3] = (2*((sc[1] >> 4) & 7) + 1);
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
                int suml = 0;
                for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j];
                sumi += db[l] * suml;
                q8 += 8;
            }
            qs += 4;
            sc += 2;
        }

        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
    }

    *s = sumf;

#endif
}

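// Illustrative sketch (not from the upstream file): the scalar branch above
// rebuilds each 9-bit iq1s_grid index from one data byte plus a single bit
// borrowed from the packed scale byte, and derives the per-group multiplier
// as the odd value 2*s + 1. The helpers below use hypothetical names and
// assume the same packing as the idx[]/db[] expressions above:
static inline uint16_t iq1s_example_index(uint8_t q, uint8_t sc, int use_high_nibble) {
    // bit 3 (low nibble) or bit 7 (high nibble) of the scale byte supplies bit 8 of the index
    return use_high_nibble ? (uint16_t)(q | ((sc & 0x80) << 1))
                           : (uint16_t)(q | ((sc & 0x08) << 5));
}
static inline int iq1s_example_scale(uint8_t sc, int use_high_nibble) {
    // the remaining 3 bits of each nibble encode the group scale s -> 2*s + 1
    return use_high_nibble ? 2*((sc >> 4) & 7) + 1 : 2*(sc & 7) + 1;
}
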
void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * restrict x = vx;
    const block_q8_0   * restrict y = vy;

    const int nb = n / QK4_NL;

#if defined __ARM_NEON
    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    uint8x16x2_t q4bits;
    int8x16x4_t q4b;
    int8x16x4_t q8b;
    int32x4_t prod_1, prod_2;

    float sumf = 0;

    for (int ib = 0; ib < nb; ib += 2) {

        q4bits.val[0] = vld1q_u8(x[ib+0].qs);
        q4bits.val[1] = vld1q_u8(x[ib+1].qs);
        q8b.val[0]    = vld1q_s8(y[ib+0].qs);
        q8b.val[1]    = vld1q_s8(y[ib+0].qs + 16);
        q8b.val[2]    = vld1q_s8(y[ib+1].qs);
        q8b.val[3]    = vld1q_s8(y[ib+1].qs + 16);

        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

        sumf +=
            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
    }

    *s = sumf;

#elif defined __AVX2__

    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i m4b  = _mm_set1_epi8(0x0f);
    const __m256i mone = _mm256_set1_epi16(1);

    __m256 accum1 = _mm256_setzero_ps();
    __m256 accum2 = _mm256_setzero_ps();
    for (int ib = 0; ib < nb; ib += 2) {
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
        const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
        const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
                                 _mm256_cvtepi32_ps(p_1), accum1);
        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
                                 _mm256_cvtepi32_ps(p_2), accum2);

        y += 2;
        x += 2;
    }

    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

#else
    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
#endif
}

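// Illustrative sketch (not from the upstream file): the IQ4_NL scalar path
// above decodes each packed byte by sending the low nibble through the
// kvalues_iq4nl lookup table for the first half of the block and the high
// nibble for the second half. A hypothetical helper making that explicit:
static inline void iq4nl_example_decode(const uint8_t * qs, int8_t * out) {
    // qs holds QK4_NL/2 packed bytes; out must hold QK4_NL decoded values
    for (int j = 0; j < QK4_NL/2; ++j) {
        out[j]            = kvalues_iq4nl[qs[j] & 0xf];
        out[j + QK4_NL/2] = kvalues_iq4nl[qs[j] >> 4];
    }
}
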
void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);
#if QK_K == 64
    ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
#else

    const block_iq4_xs * restrict x = vx;
    const block_q8_K   * restrict y = vy;

    const int nb = n / QK_K;

#if defined __ARM_NEON
    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    ggml_uint8x16x2_t q4bits;
    ggml_int8x16x4_t q4b;
    ggml_int8x16x4_t q8b;
    int32x4_t prod_1, prod_2;

    float sumf = 0;

    for (int ibl = 0; ibl < nb; ++ibl) {

        const int8_t  * q8 = y[ibl].qs;
        const uint8_t * q4 = x[ibl].qs;
        uint16_t h = x[ibl].scales_h;

        int sumi1 = 0, sumi2 = 0;
        for (int ib = 0; ib < QK_K/64; ++ib) {

            q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
            q8b    = ggml_vld1q_s8_x4(q8); q8 += 64;

            q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
            q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
            q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
            q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

            prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
            prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

            int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
            int ls2 = ((x[ibl].scales_l[ib] >>  4) | ((h << 2) & 0x30)) - 32;
            h >>= 4;
            sumi1 += vaddvq_s32(prod_1) * ls1;
            sumi2 += vaddvq_s32(prod_2) * ls2;

        }

        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
    }

    *s = sumf;

#elif defined __AVX2__

    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i m4b = _mm_set1_epi8(0x0f);

    __m256 accum = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * qs = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;
        uint16_t sh = x[ibl].scales_h;
        __m256i sumi1 = _mm256_setzero_si256();
        __m256i sumi2 = _mm256_setzero_si256();
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
            const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
            sh >>= 4;
            const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
            const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
            sumi1 = _mm256_add_epi32(p_1, sumi1);
            sumi2 = _mm256_add_epi32(p_2, sumi2);
        }
        accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
                                _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
    }

    *s = hsum_float_8(accum);

#else
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
#endif
}

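// Illustrative sketch (not from the upstream file): IQ4_XS stores a 6-bit
// scale per 32-value block, split between a nibble of scales_l and two bits
// of scales_h, re-centred by subtracting 32 -- exactly what the ls1/ls2
// expressions above compute. A hypothetical helper for block index ib (0..7):
static inline int iq4xs_example_scale(const uint8_t * scales_l, uint16_t scales_h, int ib) {
    const int lo = (ib % 2) ? (scales_l[ib/2] >> 4) : (scales_l[ib/2] & 0xf); // low 4 bits
    const int hi = (scales_h >> (2*ib)) & 0x3;                                // high 2 bits
    return (lo | (hi << 4)) - 32;
}
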
// ================================ IQ2 quantization =============================================

typedef struct {
    uint64_t * grid;
    int      * map;
    uint16_t * neighbours;
} iq2_entry_t;

static iq2_entry_t iq2_data[4] = {
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
};

static inline int iq2_data_index(enum ggml_type type) {
    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
    return type == GGML_TYPE_IQ2_XXS ? 0 :
           type == GGML_TYPE_IQ2_XS  ? 1 :
           type == GGML_TYPE_IQ1_S   ? 2 : 3;
}

static inline int iq2_grid_size(enum ggml_type type) {
    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
    return type == GGML_TYPE_IQ2_XXS ? 256 :
           type == GGML_TYPE_IQ2_XS  ? 512 :
           type == GGML_TYPE_IQ1_S   ? 512 : 1024;
}

static int iq2_compare_func(const void * left, const void * right) {
    const int * l = (const int *)left;
    const int * r = (const int *)right;
    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
}

void iq2xs_init_impl(enum ggml_type type) {
    const int gindex = iq2_data_index(type);
    const int grid_size = iq2_grid_size(type);
    if (iq2_data[gindex].grid) {
        return;
    }
static const uint16_t kgrid_2bit_256[256] = {
|
|
10719
|
+
0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
|
|
10720
|
+
100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
|
|
10721
|
+
1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
|
|
10722
|
+
1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
|
|
10723
|
+
2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
|
|
10724
|
+
4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
|
|
10725
|
+
5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
|
|
10726
|
+
8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
|
|
10727
|
+
10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
|
|
10728
|
+
16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
|
|
10729
|
+
17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
|
|
10730
|
+
20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
|
|
10731
|
+
22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
|
|
10732
|
+
25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
|
|
10733
|
+
33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
|
|
10734
|
+
37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
|
|
10735
|
+
};
|
|
10736
|
+
static const uint16_t kgrid_2bit_512[512] = {
|
|
10737
|
+
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
|
10738
|
+
73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
|
|
10739
|
+
260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
|
|
10740
|
+
352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
|
|
10741
|
+
640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
|
|
10742
|
+
1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
|
|
10743
|
+
1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
|
|
10744
|
+
2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
|
|
10745
|
+
2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
|
|
10746
|
+
4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
|
|
10747
|
+
4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
|
|
10748
|
+
5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
|
|
10749
|
+
5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
|
|
10750
|
+
8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
|
|
10751
|
+
8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
|
|
10752
|
+
10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
|
|
10753
|
+
16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
|
|
10754
|
+
16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
|
|
10755
|
+
16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
|
|
10756
|
+
17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
|
|
10757
|
+
18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
|
|
10758
|
+
20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
|
|
10759
|
+
21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
|
|
10760
|
+
22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
|
|
10761
|
+
24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
|
|
10762
|
+
32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
|
|
10763
|
+
33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
|
|
10764
|
+
33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
|
|
10765
|
+
35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
|
|
10766
|
+
37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
|
|
10767
|
+
40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
|
|
10768
|
+
42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
|
|
10769
|
+
};
|
|
10770
|
+
static const uint16_t kgrid_1bit_512[512] = {
|
|
10771
|
+
10, 33, 41, 85, 132, 134, 160, 162, 277, 337, 340, 345, 357, 405, 516, 545,
|
|
10772
|
+
553, 598, 641, 650, 681, 1042, 1044, 1097, 1169, 1176, 1320, 1345, 1365, 1378, 1434, 1444,
|
|
10773
|
+
1545, 1617, 1642, 1685, 2053, 2080, 2089, 2133, 2176, 2182, 2208, 2214, 2306, 2384, 2393, 2440,
|
|
10774
|
+
2453, 2581, 2664, 2690, 2721, 4117, 4161, 4182, 4184, 4261, 4357, 4369, 4372, 4377, 4390, 4422,
|
|
10775
|
+
4432, 4437, 4449, 4457, 4485, 4497, 4505, 4629, 4677, 4696, 4774, 5205, 5217, 5225, 5386, 5397,
|
|
10776
|
+
5409, 5445, 5457, 5460, 5461, 5462, 5465, 5472, 5477, 5525, 5545, 5650, 5668, 5717, 5729, 5769,
|
|
10777
|
+
5777, 6212, 6234, 6244, 6293, 6424, 6482, 6485, 6502, 6505, 6529, 6538, 6565, 6656, 6682, 6788,
|
|
10778
|
+
6806, 6820, 8218, 8224, 8226, 8232, 8277, 8326, 8354, 8469, 8521, 8530, 8549, 8596, 8737, 8794,
|
|
10779
|
+
9221, 9253, 9348, 9369, 9380, 9474, 9557, 9633, 9732, 9753, 9793, 9830, 9862, 9880, 10240, 10272,
|
|
10780
|
+
10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665,
|
|
10781
|
+
16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685,
|
|
10782
|
+
17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529,
|
|
10783
|
+
18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517,
|
|
10784
|
+
20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872,
|
|
10785
|
+
20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653,
|
|
10786
|
+
21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842,
|
|
10787
|
+
21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913,
|
|
10788
|
+
21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608,
|
|
10789
|
+
22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072,
|
|
10790
|
+
23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110,
|
|
10791
|
+
25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937,
|
|
10792
|
+
25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885,
|
|
10793
|
+
26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808,
|
|
10794
|
+
32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320,
|
|
10795
|
+
33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918,
|
|
10796
|
+
34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125,
|
|
10797
|
+
37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973,
|
|
10798
|
+
38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485,
|
|
10799
|
+
38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497,
|
|
10800
|
+
39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514,
|
|
10801
|
+
41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
|
|
10802
|
+
42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
|
|
10803
|
+
};
|
|
10804
|
+
static const uint16_t kgrid_2bit_1024[1024] = {
|
|
10805
|
+
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
|
10806
|
+
73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
|
|
10807
|
+
165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
|
|
10808
|
+
337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
|
|
10809
|
+
517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
|
|
10810
|
+
674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
|
|
10811
|
+
1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
|
|
10812
|
+
1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
|
|
10813
|
+
1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
|
|
10814
|
+
1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
|
|
10815
|
+
2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
|
|
10816
|
+
2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
|
|
10817
|
+
2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
|
|
10818
|
+
4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
|
|
10819
|
+
4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
|
|
10820
|
+
4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
|
|
10821
|
+
4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
|
|
10822
|
+
4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
|
|
10823
|
+
5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
|
|
10824
|
+
5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
|
|
10825
|
+
5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
|
|
10826
|
+
5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
|
|
10827
|
+
6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
|
|
10828
|
+
6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
|
|
10829
|
+
8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
|
|
10830
|
+
8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
|
|
10831
|
+
8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
|
|
10832
|
+
9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
|
|
10833
|
+
9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
|
|
10834
|
+
10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
|
|
10835
|
+
16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
|
|
10836
|
+
16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
|
|
10837
|
+
16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
|
|
10838
|
+
16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
|
|
10839
|
+
17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
|
|
10840
|
+
17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
|
|
10841
|
+
17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
|
|
10842
|
+
17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
|
|
10843
|
+
18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
|
|
10844
|
+
18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
|
|
10845
|
+
18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
|
|
10846
|
+
20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
|
|
10847
|
+
20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
|
|
10848
|
+
20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
|
|
10849
|
+
21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
|
|
10850
|
+
21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
|
|
10851
|
+
22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
|
|
10852
|
+
22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
|
|
10853
|
+
24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
|
|
10854
|
+
24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
|
|
10855
|
+
25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
|
|
10856
|
+
26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
|
|
10857
|
+
32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
|
|
10858
|
+
33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
|
|
10859
|
+
33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
|
|
10860
|
+
33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
|
|
10861
|
+
34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
|
|
10862
|
+
35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
|
|
10863
|
+
36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
|
|
10864
|
+
37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
|
|
10865
|
+
38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
|
|
10866
|
+
39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
|
|
10867
|
+
41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
|
|
10868
|
+
42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
|
|
10869
|
+
};

    const int kmap_size = 43692;
    //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
    const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
    const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
                             type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 :
                             type == GGML_TYPE_IQ1_S   ? kgrid_1bit_512 : kgrid_2bit_1024;
    uint64_t * kgrid_q2xs;
    int      * kmap_q2xs;
    uint16_t * kneighbors_q2xs;

    printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
    uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
    for (int k = 0; k < grid_size; ++k) {
        int8_t * pos = (int8_t *)(the_grid + k);
        for (int i = 0; i < 8; ++i) {
            int l = (kgrid[k] >> 2*i) & 0x3;
            pos[i] = 2*l + 1;
        }
    }
    kgrid_q2xs = the_grid;
    iq2_data[gindex].grid = the_grid;
    kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
    iq2_data[gindex].map = kmap_q2xs;
    for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
    uint64_t aux64;
    uint8_t * aux8 = (uint8_t *)&aux64;
    for (int i = 0; i < grid_size; ++i) {
        aux64 = kgrid_q2xs[i];
        uint16_t index = 0;
        for (int k=0; k<8; ++k) {
            uint16_t q = (aux8[k] - 1)/2;
            index |= (q << 2*k);
        }
        kmap_q2xs[index] = i;
    }
    int8_t pos[8];
    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
    int num_neighbors = 0, num_not_in_map = 0;
    for (int i = 0; i < kmap_size; ++i) {
        if (kmap_q2xs[i] >= 0) continue;
        ++num_not_in_map;
        for (int k = 0; k < 8; ++k) {
            int l = (i >> 2*k) & 0x3;
            pos[k] = 2*l + 1;
        }
        for (int j = 0; j < grid_size; ++j) {
            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
            int d2 = 0;
            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
            dist2[2*j+0] = d2;
            dist2[2*j+1] = j;
        }
        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
        int n = 0; int d2 = dist2[0];
        int nhave = 1;
        for (int j = 0; j < grid_size; ++j) {
            if (dist2[2*j] > d2) {
                if (nhave == nwant) break;
                d2 = dist2[2*j];
                ++nhave;
            }
            ++n;
        }
        num_neighbors += n;
    }
    printf("%s: %d neighbours in total\n", __func__, num_neighbors);
    kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
    iq2_data[gindex].neighbours = kneighbors_q2xs;
    int counter = 0;
    for (int i = 0; i < kmap_size; ++i) {
        if (kmap_q2xs[i] >= 0) continue;
        for (int k = 0; k < 8; ++k) {
            int l = (i >> 2*k) & 0x3;
            pos[k] = 2*l + 1;
        }
        for (int j = 0; j < grid_size; ++j) {
            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
            int d2 = 0;
            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
            dist2[2*j+0] = d2;
            dist2[2*j+1] = j;
        }
        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
        kmap_q2xs[i] = -(counter + 1);
        int d2 = dist2[0];
        uint16_t * start = &kneighbors_q2xs[counter++];
        int n = 0, nhave = 1;
        for (int j = 0; j < grid_size; ++j) {
            if (dist2[2*j] > d2) {
                if (nhave == nwant) break;
                d2 = dist2[2*j];
                ++nhave;
            }
            kneighbors_q2xs[counter++] = dist2[2*j+1];
            ++n;
        }
        *start = n;
    }
    free(dist2);
}

void iq2xs_free_impl(enum ggml_type type) {
    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
    const int gindex = iq2_data_index(type);
    if (iq2_data[gindex].grid) {
        free(iq2_data[gindex].grid);       iq2_data[gindex].grid = NULL;
        free(iq2_data[gindex].map);        iq2_data[gindex].map  = NULL;
        free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
    }
}

static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);
    float best_d2 = FLT_MAX;
    int   grid_index = -1;
    for (int j = 1; j <= num_neighbors; ++j) {
        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
        float d2 = 0;
        for (int i = 0; i < 8; ++i) {
            float q = pg[i];
            float diff = scale*q - xval[i];
            d2 += weight[i]*diff*diff;
        }
        if (d2 < best_d2) {
            best_d2 = d2; grid_index = neighbours[j];
        }
    }
    GGML_ASSERT(grid_index >= 0);
    const int8_t * pg = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
    return grid_index;
}

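// Illustrative sketch (not from the upstream file): iq2xs_init_impl above keys
// kmap_q2xs by packing eight 2-bit quant levels into one 16-bit index; on-grid
// entries store the grid position, while off-grid entries store -(offset+1)
// into kneighbors_q2xs, whose first element per entry is the neighbour count
// consumed by iq2_find_best_neighbour. A hypothetical packing helper:
static inline uint16_t iq2_example_pack_index(const int8_t * L /* 8 values in 0..3 */) {
    uint16_t u = 0;
    for (int k = 0; k < 8; ++k) u |= ((uint16_t)(L[k] & 0x3) << 2*k); // same packing as "index |= (q << 2*k)" above
    return u;
}
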
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
|
11007
|
+
|
|
11008
|
+
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
|
|
11009
|
+
|
|
11010
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
|
11011
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
|
11012
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
|
11013
|
+
|
|
11014
|
+
GGML_ASSERT(quant_weights && "missing quantization weights");
|
|
11015
|
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
|
11016
|
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
|
11017
|
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
|
11018
|
+
GGML_ASSERT(n%QK_K == 0);
|
|
11019
|
+
|
|
11020
|
+
const int kMaxQ = 3;
|
|
11021
|
+
|
|
11022
|
+
const int nbl = n/QK_K;
|
|
11023
|
+
|
|
11024
|
+
block_iq2_xxs * y = vy;
|
|
11025
|
+
|
|
11026
|
+
float scales[QK_K/32];
|
|
11027
|
+
float weight[32];
|
|
11028
|
+
float xval[32];
|
|
11029
|
+
int8_t L[32];
|
|
11030
|
+
int8_t Laux[32];
|
|
11031
|
+
float waux[32];
|
|
11032
|
+
uint8_t block_signs[4];
|
|
11033
|
+
uint32_t q2[2*(QK_K/32)];
|
|
11034
|
+
|
|
11035
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
|
11036
|
+
|
|
11037
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
|
11038
|
+
memset(q2, 0, QK_K/4);
|
|
11039
|
+
|
|
11040
|
+
float max_scale = 0;
|
|
11041
|
+
|
|
11042
|
+
const float * xbl = x + QK_K*ibl;
|
|
11043
|
+
float sumx2 = 0;
|
|
11044
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
|
11045
|
+
float sigma2 = sumx2/QK_K;
|
|
11046
|
+
|
|
11047
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
11048
|
+
const float * xb = xbl + 32*ib;
|
|
11049
|
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
|
11050
|
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
|
11051
|
+
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
|
11052
|
+
for (int k = 0; k < 4; ++k) {
|
|
11053
|
+
int nflip = 0;
|
|
11054
|
+
uint8_t s = 0;
|
|
11055
|
+
for (int i = 0; i < 8; ++i) {
|
|
11056
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
|
11057
|
+
else {
|
|
11058
|
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
|
11059
|
+
}
|
|
11060
|
+
}
|
|
11061
|
+
if (nflip%2) {
|
|
11062
|
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
|
11063
|
+
for (int i = 1; i < 8; ++i) {
|
|
11064
|
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
|
11065
|
+
if (ax < min) {
|
|
11066
|
+
min = ax; imin = i;
|
|
11067
|
+
}
|
|
11068
|
+
}
|
|
11069
|
+
xval[8*k+imin] = -xval[8*k+imin];
|
|
11070
|
+
s ^= (1 << imin);
|
|
11071
|
+
}
|
|
11072
|
+
block_signs[k] = s & 127;
|
|
11073
|
+
}
|
|
11074
|
+
float max = xval[0];
|
|
11075
|
+
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
|
11076
|
+
if (!max) {
|
|
11077
|
+
scales[ib] = 0;
|
|
11078
|
+
memset(L, 0, 32);
|
|
11079
|
+
continue;
|
|
11080
|
+
}
|
|
11081
|
+
float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
|
|
11082
|
+
float eff_max = scale*kMaxQ;
|
|
11083
|
+
float best = 0;
|
|
11084
|
+
for (int is = -6; is <= 6; ++is) {
|
|
11085
|
+
float id = (2*kMaxQ-1+is*0.1f)/eff_max;
|
|
11086
|
+
float this_scale = 1/id;
|
|
11087
|
+
for (int k = 0; k < 4; ++k) {
|
|
11088
|
+
for (int i = 0; i < 8; ++i) {
|
|
11089
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
|
11090
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
|
11091
|
+
}
|
|
11092
|
+
uint16_t u = 0;
|
|
11093
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
|
11094
|
+
int grid_index = kmap_q2xs[u];
|
|
11095
|
+
if (grid_index < 0) {
|
|
11096
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
|
11097
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
|
11098
|
+
}
|
|
11099
|
+
}
|
|
11100
|
+
float sumqx = 0, sumq2 = 0;
|
|
11101
|
+
for (int i = 0; i < 32; ++i) {
|
|
11102
|
+
float w = weight[i];
|
|
11103
|
+
float q = 2*Laux[i] + 1;
|
|
11104
|
+
sumqx += w*xval[i]*q;
|
|
11105
|
+
sumq2 += w*q*q;
|
|
11106
|
+
}
|
|
11107
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
|
11108
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
|
11109
|
+
memcpy(L, Laux, 32);
|
|
11110
|
+
}
|
|
11111
|
+
}
|
|
11112
|
+
if (scale > 0) {
|
|
11113
|
+
float id = 1/scale;
|
|
11114
|
+
for (int k = 0; k < 4; ++k) {
|
|
11115
|
+
uint16_t u = 0;
|
|
11116
|
+
for (int i = 0; i < 8; ++i) {
|
|
11117
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
|
11118
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
|
11119
|
+
u |= (l << 2*i);
|
|
11120
|
+
}
|
|
11121
|
+
int grid_index = kmap_q2xs[u];
|
|
11122
|
+
if (grid_index < 0) {
|
|
11123
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
|
11124
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
|
11125
|
+
}
|
|
11126
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
|
|
11127
|
+
for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
|
|
11128
|
+
}
|
|
11129
|
+
float sumqx = 0, sumq2 = 0;
|
|
11130
|
+
for (int i = 0; i < 32; ++i) {
|
|
11131
|
+
float w = weight[i];
|
|
11132
|
+
float q = 2*L[i] + 1;
|
|
11133
|
+
sumqx += w*xval[i]*q;
|
|
11134
|
+
sumq2 += w*q*q;
|
|
11135
|
+
}
|
|
11136
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
|
11137
|
+
}
|
|
11138
|
+
if (scale < 0) {
|
|
11139
|
+
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
|
11140
|
+
// and correspondingly flip quant signs.
|
|
11141
|
+
scale = -scale;
|
|
11142
|
+
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
|
11143
|
+
}
|
|
11144
|
+
for (int k = 0; k < 4; ++k) {
|
|
11145
|
+
uint16_t u = 0;
|
|
11146
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
|
11147
|
+
int grid_index = kmap_q2xs[u];
|
|
11148
|
+
if (grid_index < 0) {
|
|
11149
|
+
printf("Oops: found point %u not on grid:", u);
|
|
11150
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
|
11151
|
+
printf("\n");
|
|
11152
|
+
GGML_ASSERT(false);
|
|
11153
|
+
}
|
|
11154
|
+
q2[2*ib+0] |= (grid_index << 8*k);
|
|
11155
|
+
q2[2*ib+1] |= (block_signs[k] << 7*k);
|
|
11156
|
+
}
|
|
11157
|
+
GGML_ASSERT(scale >= 0);
|
|
11158
|
+
scales[ib] = scale;
|
|
11159
|
+
max_scale = MAX(max_scale, scale);
|
|
11160
|
+
}
|
|
11161
|
+
|
|
11162
|
+
if (!max_scale) {
|
|
11163
|
+
memset(y[ibl].qs, 0, QK_K/4);
|
|
11164
|
+
continue;
|
|
11165
|
+
}
|
|
11166
|
+
|
|
11167
|
+
float d = max_scale/31;
|
|
11168
|
+
y[ibl].d = GGML_FP32_TO_FP16(d);
|
|
11169
|
+
float id = 1/d;
|
|
11170
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
11171
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
|
11172
|
+
l = MAX(0, MIN(15, l));
|
|
11173
|
+
q2[2*ib+1] |= ((uint32_t)l << 28);
|
|
11174
|
+
}
|
|
11175
|
+
memcpy(y[ibl].qs, q2, QK_K/4);
|
|
11176
|
+
}
|
|
11177
|
+
}
|
|
11178
|
+
|
|
11179
|
+
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
|
11180
|
+
|
|
11181
|
+
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
|
|
11182
|
+
|
|
11183
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
|
11184
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
|
11185
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
|
11186
|
+
|
|
11187
|
+
GGML_ASSERT(quant_weights && "missing quantization weights");
|
|
11188
|
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
|
11189
|
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
|
11190
|
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
|
11191
|
+
GGML_ASSERT(n%QK_K == 0);
|
|
11192
|
+
|
|
11193
|
+
const int kMaxQ = 3;
|
|
11194
|
+
|
|
11195
|
+
const int nbl = n/QK_K;
|
|
11196
|
+
|
|
11197
|
+
block_iq2_xs * y = vy;
|
|
11198
|
+
|
|
11199
|
+
float scales[QK_K/16];
|
|
11200
|
+
float weight[16];
|
|
11201
|
+
float xval[16];
|
|
11202
|
+
int8_t L[16];
|
|
11203
|
+
int8_t Laux[16];
|
|
11204
|
+
float waux[16];
|
|
11205
|
+
bool is_on_grid[2];
|
|
11206
|
+
bool is_on_grid_aux[2];
|
|
11207
|
+
uint8_t block_signs[2];
|
|
11208
|
+
uint16_t q2[2*(QK_K/16)];
|
|
11209
|
+
|
|
11210
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
|
11211
|
+
|
|
11212
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
|
11213
|
+
memset(q2, 0, QK_K/4);
|
|
11214
|
+
memset(y[ibl].scales, 0, QK_K/32);
|
|
11215
|
+
|
|
11216
|
+
float max_scale = 0;
|
|
11217
|
+
|
|
11218
|
+
const float * xbl = x + QK_K*ibl;
|
|
11219
|
+
float sumx2 = 0;
|
|
11220
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
|
11221
|
+
float sigma2 = sumx2/QK_K;
|
|
11222
|
+
|
|
11223
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
|
11224
|
+
const float * xb = xbl + 16*ib;
|
|
11225
|
+
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
|
11226
|
+
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
|
11227
|
+
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
|
11228
|
+
for (int k = 0; k < 2; ++k) {
|
|
11229
|
+
int nflip = 0;
|
|
11230
|
+
uint8_t s = 0;
|
|
11231
|
+
for (int i = 0; i < 8; ++i) {
|
|
11232
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
|
11233
|
+
else {
|
|
11234
|
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
|
11235
|
+
}
|
|
11236
|
+
}
|
|
11237
|
+
if (nflip%2) {
|
|
11238
|
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
|
11239
|
+
for (int i = 1; i < 8; ++i) {
|
|
11240
|
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
|
11241
|
+
if (ax < min) {
|
|
11242
|
+
min = ax; imin = i;
|
|
11243
|
+
}
|
|
11244
|
+
}
|
|
11245
|
+
xval[8*k+imin] = -xval[8*k+imin];
|
|
11246
|
+
s ^= (1 << imin);
|
|
11247
|
+
}
|
|
11248
|
+
block_signs[k] = s & 127;
|
|
11249
|
+
}
|
|
11250
|
+
float max = xval[0];
|
|
11251
|
+
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
|
11252
|
+
if (!max) {
|
|
11253
|
+
scales[ib] = 0;
|
|
11254
|
+
memset(L, 0, 16);
|
|
11255
|
+
continue;
|
|
11256
|
+
}
|
|
11257
|
+
float best = 0;
|
|
11258
|
+
float scale = max/(2*kMaxQ-1);
|
|
11259
|
+
is_on_grid[0] = is_on_grid[1] = true;
|
|
11260
|
+
for (int is = -9; is <= 9; ++is) {
|
|
11261
|
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
|
11262
|
+
float this_scale = 1/id;
|
|
11263
|
+
for (int k = 0; k < 2; ++k) {
|
|
11264
|
+
for (int i = 0; i < 8; ++i) {
|
|
11265
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
|
11266
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
|
11267
|
+
}
|
|
11268
|
+
uint16_t u = 0;
|
|
11269
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
|
11270
|
+
int grid_index = kmap_q2xs[u];
|
|
11271
|
+
is_on_grid_aux[k] = true;
|
|
11272
|
+
if (grid_index < 0) {
|
|
11273
|
+
is_on_grid_aux[k] = false;
|
|
11274
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
|
11275
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
|
11276
|
+
}
|
|
11277
|
+
}
|
|
11278
|
+
float sumqx = 0, sumq2 = 0;
|
|
11279
|
+
for (int i = 0; i < 16; ++i) {
|
|
11280
|
+
float w = weight[i];
|
|
11281
|
+
float q = 2*Laux[i] + 1;
|
|
11282
|
+
sumqx += w*xval[i]*q;
|
|
11283
|
+
sumq2 += w*q*q;
|
|
11284
|
+
}
|
|
11285
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
|
11286
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
|
11287
|
+
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
|
11288
|
+
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
|
11289
|
+
}
|
|
11290
|
+
}
|
|
11291
|
+
int n_not_ongrid = 0;
|
|
11292
|
+
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
|
11293
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
|
11294
|
+
float id = 1/scale;
|
|
11295
|
+
for (int k = 0; k < 2; ++k) {
|
|
11296
|
+
if (is_on_grid[k]) continue;
|
|
11297
|
+
uint16_t u = 0;
|
|
11298
|
+
for (int i = 0; i < 8; ++i) {
|
|
11299
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
|
11300
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
|
11301
|
+
u |= (l << 2*i);
|
|
11302
|
+
L[8*k + i] = l;
|
|
11303
|
+
}
|
|
11304
|
+
int grid_index = kmap_q2xs[u];
|
|
11305
|
+
if (grid_index < 0) {
|
|
11306
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
|
11307
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
|
11308
|
+
}
|
|
11309
|
+
}
|
|
11310
|
+
float sumqx = 0, sumq2 = 0;
|
|
11311
|
+
for (int i = 0; i < 16; ++i) {
|
|
11312
|
+
float w = weight[i];
|
|
11313
|
+
float q = 2*L[i] + 1;
|
|
11314
|
+
sumqx += w*xval[i]*q;
|
|
11315
|
+
sumq2 += w*q*q;
|
|
11316
|
+
}
|
|
11317
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
|
11318
|
+
}
|
|
11319
|
+
if (scale < 0) {
|
|
11320
|
+
scale = -scale;
|
|
11321
|
+
for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
|
11322
|
+
}
|
|
11323
|
+
for (int k = 0; k < 2; ++k) {
|
|
11324
|
+
uint16_t u = 0;
|
|
11325
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
|
11326
|
+
int grid_index = kmap_q2xs[u];
|
|
11327
|
+
if (grid_index < 0) {
|
|
11328
|
+
printf("Oops: found point %u not on grid:", u);
|
|
11329
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
|
11330
|
+
printf("\n");
|
|
11331
|
+
GGML_ASSERT(false);
|
|
11332
|
+
}
|
|
11333
|
+
q2[2*ib+k] = grid_index | (block_signs[k] << 9);
|
|
11334
|
+
}
|
|
11335
|
+
GGML_ASSERT(scale >= 0);
|
|
11336
|
+
scales[ib] = scale;
|
|
11337
|
+
max_scale = MAX(max_scale, scale);
|
|
11338
|
+
}
|
|
11339
|
+
|
|
11340
|
+
if (!max_scale) {
|
|
11341
|
+
memset(y[ibl].qs, 0, QK_K/4);
|
|
11342
|
+
continue;
|
|
11343
|
+
}
|
|
11344
|
+
|
|
11345
|
+
float d = max_scale/31;
|
|
11346
|
+
y[ibl].d = GGML_FP32_TO_FP16(d);
|
|
11347
|
+
float id = 1/d;
|
|
11348
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
|
11349
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
|
11350
|
+
l = MAX(0, MIN(15, l));
|
|
11351
|
+
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
|
11352
|
+
else y[ibl].scales[ib/2] |= (l << 4);
|
|
11353
|
+
}
|
|
11354
|
+
memcpy(y[ibl].qs, q2, QK_K/4);
|
|
11355
|
+
|
|
11356
|
+
}
|
|
11357
|
+
}
|
|
11358
|
+

size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    (void)hist;
    GGML_ASSERT(n_per_row%QK_K == 0);
    int nblock = n_per_row/QK_K;
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += nblock*sizeof(block_iq2_xxs);
    }
    return nrow * nblock * sizeof(block_iq2_xxs);
}

size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    (void)hist;
    GGML_ASSERT(n_per_row%QK_K == 0);
    int nblock = n_per_row/QK_K;
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += nblock*sizeof(block_iq2_xs);
    }
    return nrow * nblock * sizeof(block_iq2_xs);
}

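// Illustrative sketch (not from the upstream file): both wrappers above write
// nrow rows of n_per_row/QK_K blocks, so a caller sizes the destination buffer
// as rows * blocks-per-row * block size. A hypothetical helper for IQ2_XXS:
static inline size_t iq2xxs_example_dst_size(int nrow, int n_per_row) {
    return (size_t)nrow * (size_t)(n_per_row/QK_K) * sizeof(block_iq2_xxs);
}
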
//
// ============================================= 3-bit using D4 lattice
//

typedef struct {
    uint32_t * grid;
    int      * map;
    uint16_t * neighbours;
} iq3_entry_t;

static iq3_entry_t iq3_data[2] = {
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
};

static inline int iq3_data_index(int grid_size) {
    (void)grid_size;
    GGML_ASSERT(grid_size == 256 || grid_size == 512);
    return grid_size == 256 ? 0 : 1;
}

static int iq3_compare_func(const void * left, const void * right) {
    const int * l = (const int *)left;
    const int * r = (const int *)right;
    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
}

void iq3xs_init_impl(int grid_size) {
    const int gindex = iq3_data_index(grid_size);
    if (iq3_data[gindex].grid) {
        return;
    }
static const uint16_t kgrid_256[256] = {
|
|
11418
|
+
0, 2, 4, 9, 11, 15, 16, 18, 25, 34, 59, 61, 65, 67, 72, 74,
|
|
11419
|
+
81, 85, 88, 90, 97, 108, 120, 128, 130, 132, 137, 144, 146, 153, 155, 159,
|
|
11420
|
+
169, 175, 189, 193, 199, 200, 202, 213, 248, 267, 287, 292, 303, 315, 317, 321,
|
|
11421
|
+
327, 346, 362, 413, 436, 456, 460, 462, 483, 497, 513, 515, 520, 522, 529, 531,
|
|
11422
|
+
536, 538, 540, 551, 552, 576, 578, 585, 592, 594, 641, 643, 648, 650, 657, 664,
|
|
11423
|
+
698, 704, 706, 720, 729, 742, 758, 769, 773, 808, 848, 852, 870, 889, 901, 978,
|
|
11424
|
+
992, 1024, 1026, 1033, 1035, 1040, 1042, 1046, 1049, 1058, 1089, 1091, 1093, 1096, 1098, 1105,
|
|
11425
|
+
1112, 1139, 1143, 1144, 1152, 1154, 1161, 1167, 1168, 1170, 1183, 1184, 1197, 1217, 1224, 1228,
|
|
11426
|
+
1272, 1276, 1309, 1323, 1347, 1367, 1377, 1404, 1473, 1475, 1486, 1509, 1537, 1544, 1546, 1553,
|
|
11427
|
+
1555, 1576, 1589, 1594, 1600, 1602, 1616, 1625, 1636, 1638, 1665, 1667, 1672, 1685, 1706, 1722,
|
|
11428
|
+
1737, 1755, 1816, 1831, 1850, 1856, 1862, 1874, 1901, 1932, 1950, 1971, 2011, 2032, 2052, 2063,
|
|
11429
|
+
2077, 2079, 2091, 2095, 2172, 2192, 2207, 2208, 2224, 2230, 2247, 2277, 2308, 2345, 2356, 2389,
|
|
11430
|
+
2403, 2424, 2501, 2504, 2506, 2520, 2570, 2593, 2616, 2624, 2630, 2646, 2669, 2700, 2714, 2746,
|
|
11431
|
+
2754, 2795, 2824, 2835, 2839, 2874, 2882, 2905, 2984, 3028, 3042, 3092, 3108, 3110, 3124, 3153,
|
|
11432
|
+
3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
|
|
11433
|
+
3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
|
|
11434
|
+
};
|
|
11435
|
+
static const uint16_t kgrid_512[512] = {
|
|
11436
|
+
0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34,
|
|
11437
|
+
37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77,
|
|
11438
|
+
80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142,
|
|
11439
|
+
145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210,
|
|
11440
|
+
217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288,
|
|
11441
|
+
291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393,
|
|
11442
|
+
395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514,
|
|
11443
|
+
516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576,
|
|
11444
|
+
577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653,
|
|
11445
|
+
655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727,
|
|
11446
|
+
728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833,
|
|
11447
|
+
840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977,
|
|
11448
|
+
989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047,
|
|
11449
|
+
1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103,
|
|
11450
|
+
1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199,
|
|
11451
|
+
1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296,
|
|
11452
|
+
1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415,
|
|
11453
|
+
1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561,
|
|
11454
|
+
1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648,
|
|
11455
|
+
1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761,
|
|
11456
|
+
1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877,
|
|
11457
|
+
1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068,
|
|
11458
|
+
2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177,
|
|
11459
|
+
2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269,
|
|
11460
|
+
2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520,
|
|
11461
|
+
2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634,
|
|
11462
|
+
2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805,
|
|
11463
|
+
2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083,
|
|
11464
|
+
3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276,
|
|
11465
|
+
3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591,
|
|
11466
|
+
3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729,
|
|
11467
|
+
3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032,
|
|
11468
|
+
};
|
|
11469
|
+
|
|
11470
|
+
const int kmap_size = 4096;
|
|
11471
|
+
const int nwant = grid_size == 256 ? 2 : 3;
|
|
11472
|
+
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
|
|
11473
|
+
uint32_t * kgrid_q3xs;
|
|
11474
|
+
int * kmap_q3xs;
|
|
11475
|
+
uint16_t * kneighbors_q3xs;
|
|
11476
|
+
|
|
11477
|
+
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
|
11478
|
+
uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
|
|
11479
|
+
for (int k = 0; k < grid_size; ++k) {
|
|
11480
|
+
int8_t * pos = (int8_t *)(the_grid + k);
|
|
11481
|
+
for (int i = 0; i < 4; ++i) {
|
|
11482
|
+
int l = (kgrid[k] >> 3*i) & 0x7;
|
|
11483
|
+
pos[i] = 2*l + 1;
|
|
11484
|
+
}
|
|
11485
|
+
}
|
|
11486
|
+
kgrid_q3xs = the_grid;
|
|
11487
|
+
iq3_data[gindex].grid = the_grid;
|
|
11488
|
+
kmap_q3xs = (int *)malloc(kmap_size*sizeof(int));
|
|
11489
|
+
iq3_data[gindex].map = kmap_q3xs;
|
|
11490
|
+
for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1;
|
|
11491
|
+
uint32_t aux32;
|
|
11492
|
+
uint8_t * aux8 = (uint8_t *)&aux32;
|
|
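The new grid tables above encode each D4-lattice point as four 3-bit fields of a 12-bit index, with each coordinate stored as pos = 2*l + 1 for l in 0..7; that is why kmap_size is 4096. The following stand-alone sketch restates that packing/unpacking rule (helper names are illustrative, not identifiers from the diff).

    // Sketch of the 4 x 3-bit index encoding used by the D4-lattice grid above.
    #include <stdint.h>

    static uint16_t pack_d4_index(const int8_t pos[4]) {
        uint16_t index = 0;
        for (int k = 0; k < 4; ++k) {
            uint16_t q = (uint16_t)((pos[k] - 1)/2);   // recover l in 0..7
            index |= (uint16_t)(q << 3*k);
        }
        return index;                                   // 0 .. 4095
    }

    static void unpack_d4_index(uint16_t index, int8_t pos[4]) {
        for (int k = 0; k < 4; ++k) {
            int l = (index >> 3*k) & 0x7;
            pos[k] = (int8_t)(2*l + 1);                 // back to odd coordinates 1..15
        }
    }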
     for (int i = 0; i < grid_size; ++i) {
-
+        aux32 = kgrid_q3xs[i];
         uint16_t index = 0;
-        for (int k=0; k<
+        for (int k=0; k<4; ++k) {
             uint16_t q = (aux8[k] - 1)/2;
-            index |= (q <<
+            index |= (q << 3*k);
         }
-
+        kmap_q3xs[index] = i;
     }
-    int8_t pos[
+    int8_t pos[4];
     int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
     int num_neighbors = 0, num_not_in_map = 0;
     for (int i = 0; i < kmap_size; ++i) {
-        if (
+        if (kmap_q3xs[i] >= 0) continue;
         ++num_not_in_map;
-        for (int k = 0; k <
-            int l = (i >>
+        for (int k = 0; k < 4; ++k) {
+            int l = (i >> 3*k) & 0x7;
             pos[k] = 2*l + 1;
         }
         for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(
+            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
             int d2 = 0;
-            for (int k = 0; k <
+            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
             dist2[2*j+0] = d2;
             dist2[2*j+1] = j;
         }
-        qsort(dist2, grid_size, 2*sizeof(int),
+        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
         int n = 0; int d2 = dist2[0];
         int nhave = 1;
         for (int j = 0; j < grid_size; ++j) {
@@ -9251,26 +11530,26 @@ void iq2xs_init_impl(int grid_size) {
         num_neighbors += n;
     }
     printf("%s: %d neighbours in total\n", __func__, num_neighbors);
-
-
+    kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
+    iq3_data[gindex].neighbours = kneighbors_q3xs;
     int counter = 0;
     for (int i = 0; i < kmap_size; ++i) {
-        if (
-        for (int k = 0; k <
-            int l = (i >>
+        if (kmap_q3xs[i] >= 0) continue;
+        for (int k = 0; k < 4; ++k) {
+            int l = (i >> 3*k) & 0x7;
             pos[k] = 2*l + 1;
         }
         for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(
+            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
             int d2 = 0;
-            for (int k = 0; k <
+            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
             dist2[2*j+0] = d2;
             dist2[2*j+1] = j;
         }
-        qsort(dist2, grid_size, 2*sizeof(int),
-
+        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
+        kmap_q3xs[i] = -(counter + 1);
         int d2 = dist2[0];
-        uint16_t * start = &
+        uint16_t * start = &kneighbors_q3xs[counter++];
         int n = 0, nhave = 1;
         for (int j = 0; j < grid_size; ++j) {
             if (dist2[2*j] > d2) {
@@ -9278,7 +11557,7 @@ void iq2xs_init_impl(int grid_size) {
                 d2 = dist2[2*j];
                 ++nhave;
             }
-
+            kneighbors_q3xs[counter++] = dist2[2*j+1];
             ++n;
         }
         *start = n;
@@ -9286,17 +11565,17 @@ void iq2xs_init_impl(int grid_size) {
     free(dist2);
 }
 
-void
-GGML_ASSERT(grid_size == 256 || grid_size == 512
-const int gindex =
-if (
-    free(
-    free(
-    free(
+void iq3xs_free_impl(int grid_size) {
+    GGML_ASSERT(grid_size == 256 || grid_size == 512);
+    const int gindex = iq3_data_index(grid_size);
+    if (iq3_data[gindex].grid) {
+        free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
+        free(iq3_data[gindex].map); iq3_data[gindex].map = NULL;
+        free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL;
     }
 }
 
|
-
static int
|
|
11578
|
+
static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
|
|
9300
11579
|
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
|
9301
11580
|
int num_neighbors = neighbours[0];
|
|
9302
11581
|
GGML_ASSERT(num_neighbors > 0);
|
|
@@ -9305,7 +11584,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
|
9305
11584
|
for (int j = 1; j <= num_neighbors; ++j) {
|
|
9306
11585
|
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
|
|
9307
11586
|
float d2 = 0;
|
|
9308
|
-
for (int i = 0; i <
|
|
11587
|
+
for (int i = 0; i < 4; ++i) {
|
|
9309
11588
|
float q = pg[i];
|
|
9310
11589
|
float diff = scale*q - xval[i];
|
|
9311
11590
|
d2 += weight[i]*diff*diff;
|
|
@@ -9316,29 +11595,44 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
|
9316
11595
|
}
|
|
9317
11596
|
GGML_ASSERT(grid_index >= 0);
|
|
9318
11597
|
const int8_t * pg = (const int8_t *)(grid + grid_index);
|
|
9319
|
-
for (int i = 0; i <
|
|
11598
|
+
for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2;
|
|
9320
11599
|
return grid_index;
|
|
9321
11600
|
}
|
|
9322
11601
|
|
|
9323
|
-
static void
|
|
11602
|
+
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
|
|
11603
|
+
const float * restrict quant_weights) {
|
|
9324
11604
|
|
|
9325
|
-
const int gindex =
|
|
11605
|
+
const int gindex = iq3_data_index(grid_size);
|
|
9326
11606
|
|
|
9327
|
-
const
|
|
9328
|
-
const int *
|
|
9329
|
-
const uint16_t *
|
|
11607
|
+
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
|
11608
|
+
const int * kmap_q3xs = iq3_data[gindex].map;
|
|
11609
|
+
const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
|
|
9330
11610
|
|
|
9331
|
-
GGML_ASSERT(quant_weights && "missing quantization weights");
|
|
9332
|
-
GGML_ASSERT(
|
|
9333
|
-
GGML_ASSERT(
|
|
9334
|
-
GGML_ASSERT(
|
|
11611
|
+
//GGML_ASSERT(quant_weights && "missing quantization weights");
|
|
11612
|
+
GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
|
|
11613
|
+
GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
|
|
11614
|
+
GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
|
|
9335
11615
|
GGML_ASSERT(n%QK_K == 0);
|
|
9336
11616
|
|
|
9337
|
-
const int kMaxQ =
|
|
11617
|
+
const int kMaxQ = 8;
|
|
9338
11618
|
|
|
9339
|
-
const int nbl = n/
|
|
11619
|
+
const int nbl = n/QK_K;
|
|
9340
11620
|
|
|
9341
|
-
|
|
11621
|
+
ggml_fp16_t * dh;
|
|
11622
|
+
uint8_t * qs;
|
|
11623
|
+
int block_size;
|
|
11624
|
+
if (grid_size == 256) {
|
|
11625
|
+
block_iq3_xxs * y = vy;
|
|
11626
|
+
dh = &y->d;
|
|
11627
|
+
qs = y->qs;
|
|
11628
|
+
block_size = sizeof(block_iq3_xxs);
|
|
11629
|
+
} else {
|
|
11630
|
+
block_iq3_s * y = vy;
|
|
11631
|
+
dh = &y->d;
|
|
11632
|
+
qs = y->qs;
|
|
11633
|
+
block_size = sizeof(block_iq3_s);
|
|
11634
|
+
}
|
|
11635
|
+
int quant_size = block_size - sizeof(ggml_fp16_t);
|
|
9342
11636
|
|
|
9343
11637
|
float scales[QK_K/32];
|
|
9344
11638
|
float weight[32];
|
|
@@ -9346,25 +11640,33 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
|
9346
11640
|
int8_t L[32];
|
|
9347
11641
|
int8_t Laux[32];
|
|
9348
11642
|
float waux[32];
|
|
9349
|
-
|
|
9350
|
-
|
|
11643
|
+
bool is_on_grid[8];
|
|
11644
|
+
bool is_on_grid_aux[8];
|
|
11645
|
+
uint8_t block_signs[8];
|
|
11646
|
+
uint8_t q3[3*(QK_K/8)+QK_K/32];
|
|
11647
|
+
uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
|
|
11648
|
+
uint8_t * qh = q3 + 3*(QK_K/8);
|
|
9351
11649
|
|
|
9352
11650
|
for (int ibl = 0; ibl < nbl; ++ibl) {
|
|
9353
11651
|
|
|
9354
|
-
|
|
9355
|
-
memset(
|
|
11652
|
+
dh[0] = GGML_FP32_TO_FP16(0.f);
|
|
11653
|
+
memset(q3, 0, 3*QK_K/8+QK_K/32);
|
|
9356
11654
|
|
|
9357
11655
|
float max_scale = 0;
|
|
9358
11656
|
|
|
9359
11657
|
const float * xbl = x + QK_K*ibl;
|
|
9360
11658
|
float sumx2 = 0;
|
|
9361
11659
|
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
|
9362
|
-
float sigma2 = sumx2/QK_K;
|
|
11660
|
+
float sigma2 = 2*sumx2/QK_K;
|
|
9363
11661
|
|
|
9364
11662
|
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
9365
11663
|
const float * xb = xbl + 32*ib;
|
|
9366
|
-
|
|
9367
|
-
|
|
11664
|
+
if (quant_weights) {
|
|
11665
|
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
|
11666
|
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
|
11667
|
+
} else {
|
|
11668
|
+
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
|
11669
|
+
}
|
|
9368
11670
|
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
|
9369
11671
|
for (int k = 0; k < 4; ++k) {
|
|
9370
11672
|
int nflip = 0;
|
|
@@ -9395,23 +11697,24 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
|
9395
11697
|
memset(L, 0, 32);
|
|
9396
11698
|
continue;
|
|
9397
11699
|
}
|
|
9398
|
-
float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
|
|
9399
|
-
float eff_max = scale*kMaxQ;
|
|
9400
11700
|
float best = 0;
|
|
9401
|
-
|
|
9402
|
-
|
|
11701
|
+
float scale = max/(2*kMaxQ-1);
|
|
11702
|
+
for (int is = -15; is <= 15; ++is) {
|
|
11703
|
+
float id = (2*kMaxQ-1+is*0.2f)/max;
|
|
9403
11704
|
float this_scale = 1/id;
|
|
9404
|
-
for (int k = 0; k <
|
|
9405
|
-
for (int i = 0; i <
|
|
9406
|
-
int l = nearest_int(0.5f*(id*xval[
|
|
9407
|
-
Laux[
|
|
11705
|
+
for (int k = 0; k < 8; ++k) {
|
|
11706
|
+
for (int i = 0; i < 4; ++i) {
|
|
11707
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
|
11708
|
+
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
|
9408
11709
|
}
|
|
9409
11710
|
uint16_t u = 0;
|
|
9410
|
-
for (int i = 0; i <
|
|
9411
|
-
int grid_index =
|
|
11711
|
+
for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
|
|
11712
|
+
int grid_index = kmap_q3xs[u];
|
|
11713
|
+
is_on_grid_aux[k] = true;
|
|
9412
11714
|
if (grid_index < 0) {
|
|
9413
|
-
|
|
9414
|
-
|
|
11715
|
+
is_on_grid_aux[k] = false;
|
|
11716
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
|
11717
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
|
|
9415
11718
|
}
|
|
9416
11719
|
}
|
|
9417
11720
|
float sumqx = 0, sumq2 = 0;
|
|
@@ -9423,25 +11726,29 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
|
9423
11726
|
}
|
|
9424
11727
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
|
9425
11728
|
scale = sumqx/sumq2; best = scale*sumqx;
|
|
9426
|
-
|
|
11729
|
+
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
|
|
11730
|
+
for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
|
9427
11731
|
}
|
|
9428
11732
|
}
|
|
9429
|
-
|
|
11733
|
+
int n_not_ongrid = 0;
|
|
11734
|
+
for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
|
11735
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
|
9430
11736
|
float id = 1/scale;
|
|
9431
|
-
for (int k = 0; k <
|
|
11737
|
+
for (int k = 0; k < 8; ++k) {
|
|
11738
|
+
if (is_on_grid[k]) continue;
|
|
9432
11739
|
uint16_t u = 0;
|
|
9433
|
-
for (int i = 0; i <
|
|
9434
|
-
int l = nearest_int(0.5f*(id*xval[
|
|
11740
|
+
for (int i = 0; i < 4; ++i) {
|
|
11741
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
|
9435
11742
|
l = MAX(0, MIN(kMaxQ-1, l));
|
|
9436
|
-
u |= (l <<
|
|
11743
|
+
u |= (l << 3*i);
|
|
9437
11744
|
}
|
|
9438
|
-
int grid_index =
|
|
11745
|
+
int grid_index = kmap_q3xs[u];
|
|
9439
11746
|
if (grid_index < 0) {
|
|
9440
|
-
const uint16_t * neighbours =
|
|
9441
|
-
grid_index =
|
|
11747
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
|
11748
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
|
|
9442
11749
|
}
|
|
9443
|
-
const int8_t * pg = (const int8_t *)(
|
|
9444
|
-
for (int i = 0; i <
|
|
11750
|
+
const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
|
|
11751
|
+
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
|
9445
11752
|
}
|
|
9446
11753
|
float sumqx = 0, sumq2 = 0;
|
|
9447
11754
|
for (int i = 0; i < 32; ++i) {
|
|
@@ -9458,142 +11765,173 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
|
9458
11765
|
scale = -scale;
|
|
9459
11766
|
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
|
9460
11767
|
}
|
|
9461
|
-
for (int k = 0; k <
|
|
11768
|
+
for (int k = 0; k < 8; ++k) {
|
|
9462
11769
|
uint16_t u = 0;
|
|
9463
|
-
for (int i = 0; i <
|
|
9464
|
-
int grid_index =
|
|
11770
|
+
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
|
11771
|
+
int grid_index = kmap_q3xs[u];
|
|
9465
11772
|
if (grid_index < 0) {
|
|
9466
11773
|
printf("Oops: found point %u not on grid:", u);
|
|
9467
|
-
for (int i = 0; i <
|
|
11774
|
+
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
|
9468
11775
|
printf("\n");
|
|
9469
11776
|
GGML_ASSERT(false);
|
|
9470
11777
|
}
|
|
9471
|
-
|
|
9472
|
-
|
|
11778
|
+
if (grid_size == 256) {
|
|
11779
|
+
q3[8*ib+k] = grid_index;
|
|
11780
|
+
} else {
|
|
11781
|
+
q3[8*ib+k] = grid_index & 255;
|
|
11782
|
+
qh[ib] |= ((grid_index >> 8) << k);
|
|
11783
|
+
}
|
|
11784
|
+
|
|
9473
11785
|
}
|
|
11786
|
+
scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
|
|
9474
11787
|
GGML_ASSERT(scale >= 0);
|
|
9475
11788
|
scales[ib] = scale;
|
|
9476
11789
|
max_scale = MAX(max_scale, scale);
|
|
9477
11790
|
}
|
|
9478
11791
|
|
|
9479
11792
|
if (!max_scale) {
|
|
9480
|
-
memset(
|
|
11793
|
+
memset(qs, 0, quant_size);
|
|
11794
|
+
dh += block_size/sizeof(ggml_fp16_t);
|
|
11795
|
+
qs += block_size;
|
|
9481
11796
|
continue;
|
|
9482
11797
|
}
|
|
9483
11798
|
|
|
9484
11799
|
float d = max_scale/31;
|
|
9485
|
-
|
|
11800
|
+
dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor
|
|
9486
11801
|
float id = 1/d;
|
|
9487
11802
|
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
9488
11803
|
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
|
9489
11804
|
l = MAX(0, MIN(15, l));
|
|
9490
|
-
|
|
11805
|
+
scales_and_signs[ib] |= ((uint32_t)l << 28);
|
|
9491
11806
|
}
|
|
9492
|
-
memcpy(
|
|
11807
|
+
memcpy(qs, q3, quant_size);
|
|
11808
|
+
|
|
11809
|
+
dh += block_size/sizeof(ggml_fp16_t);
|
|
11810
|
+
qs += block_size;
|
|
11811
|
+
|
|
9493
11812
|
}
|
|
9494
11813
|
}
|
|
9495
11814
|
|
|
9496
|
-
|
|
11815
|
+
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
|
11816
|
+
(void)hist;
|
|
11817
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
11818
|
+
int nblock = n_per_row/QK_K;
|
|
11819
|
+
char * qrow = (char *)dst;
|
|
11820
|
+
for (int row = 0; row < nrow; ++row) {
|
|
11821
|
+
quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
|
|
11822
|
+
src += n_per_row;
|
|
11823
|
+
qrow += nblock*sizeof(block_iq3_xxs);
|
|
11824
|
+
}
|
|
11825
|
+
return nrow * nblock * sizeof(block_iq3_xxs);
|
|
11826
|
+
}
|
|
9497
11827
|
|
|
9498
|
-
|
|
11828
|
+
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
|
|
11829
|
+
assert(k % QK_K == 0);
|
|
11830
|
+
block_iq3_xxs * restrict y = vy;
|
|
11831
|
+
quantize_row_iq3_xxs_reference(x, y, k);
|
|
11832
|
+
}
|
|
9499
11833
|
|
|
9500
|
-
|
|
9501
|
-
|
|
9502
|
-
|
|
11834
|
+
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
|
|
11835
|
+
assert(k % QK_K == 0);
|
|
11836
|
+
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
|
11837
|
+
}
|
|
9503
11838
|
|
|
9504
|
-
|
|
9505
|
-
|
|
9506
|
-
|
|
9507
|
-
|
|
11839
|
+
static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
|
|
11840
|
+
const float * restrict quant_weights,
|
|
11841
|
+
float * scales,
|
|
11842
|
+
float * weight,
|
|
11843
|
+
float * xval,
|
|
11844
|
+
int8_t * L,
|
|
11845
|
+
int8_t * Laux,
|
|
11846
|
+
float * waux,
|
|
11847
|
+
bool * is_on_grid,
|
|
11848
|
+
bool * is_on_grid_aux,
|
|
11849
|
+
uint8_t * block_signs) {
|
|
11850
|
+
|
|
11851
|
+
const int gindex = iq3_data_index(512);
|
|
11852
|
+
|
|
11853
|
+
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
|
11854
|
+
const int * kmap_q3xs = iq3_data[gindex].map;
|
|
11855
|
+
const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
|
|
11856
|
+
|
|
11857
|
+
//GGML_ASSERT(quant_weights && "missing quantization weights");
|
|
11858
|
+
GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
|
|
11859
|
+
GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
|
|
11860
|
+
GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
|
|
9508
11861
|
GGML_ASSERT(n%QK_K == 0);
|
|
9509
11862
|
|
|
9510
|
-
const int kMaxQ =
|
|
11863
|
+
const int kMaxQ = 8;
|
|
9511
11864
|
|
|
9512
|
-
const int nbl = n/
|
|
11865
|
+
const int nbl = n/QK_K;
|
|
9513
11866
|
|
|
9514
|
-
|
|
11867
|
+
block_iq3_s * y = vy;
|
|
9515
11868
|
|
|
9516
|
-
|
|
9517
|
-
|
|
9518
|
-
float xval[16];
|
|
9519
|
-
int8_t L[16];
|
|
9520
|
-
int8_t Laux[16];
|
|
9521
|
-
float waux[16];
|
|
9522
|
-
bool is_on_grid[2];
|
|
9523
|
-
bool is_on_grid_aux[2];
|
|
9524
|
-
uint8_t block_signs[2];
|
|
9525
|
-
uint16_t q2[2*(QK_K/16)];
|
|
11869
|
+
const int bs4 = block_size/4;
|
|
11870
|
+
const int bs8 = block_size/8;
|
|
9526
11871
|
|
|
9527
11872
|
for (int ibl = 0; ibl < nbl; ++ibl) {
|
|
9528
11873
|
|
|
11874
|
+
memset(&y[ibl], 0, sizeof(block_iq3_s));
|
|
9529
11875
|
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
|
9530
|
-
|
|
9531
|
-
|
|
11876
|
+
|
|
11877
|
+
uint8_t * qs = y[ibl].qs;
|
|
11878
|
+
uint8_t * qh = y[ibl].qh;
|
|
11879
|
+
uint8_t * signs = y[ibl].signs;
|
|
9532
11880
|
|
|
9533
11881
|
float max_scale = 0;
|
|
9534
11882
|
|
|
9535
11883
|
const float * xbl = x + QK_K*ibl;
|
|
9536
11884
|
float sumx2 = 0;
|
|
9537
11885
|
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
|
9538
|
-
float sigma2 = sumx2/QK_K;
|
|
9539
|
-
|
|
9540
|
-
for (int ib = 0; ib < QK_K/
|
|
9541
|
-
const float * xb = xbl +
|
|
9542
|
-
|
|
9543
|
-
|
|
9544
|
-
|
|
9545
|
-
|
|
9546
|
-
int
|
|
9547
|
-
|
|
9548
|
-
|
|
9549
|
-
|
|
9550
|
-
|
|
9551
|
-
|
|
9552
|
-
|
|
9553
|
-
|
|
9554
|
-
|
|
9555
|
-
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
|
9556
|
-
for (int i = 1; i < 8; ++i) {
|
|
9557
|
-
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
|
9558
|
-
if (ax < min) {
|
|
9559
|
-
min = ax; imin = i;
|
|
9560
|
-
}
|
|
11886
|
+
float sigma2 = 2*sumx2/QK_K;
|
|
11887
|
+
|
|
11888
|
+
for (int ib = 0; ib < QK_K/block_size; ++ib) {
|
|
11889
|
+
const float * xb = xbl + block_size*ib;
|
|
11890
|
+
if (quant_weights) {
|
|
11891
|
+
const float * qw = quant_weights + QK_K*ibl + block_size*ib;
|
|
11892
|
+
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
|
11893
|
+
} else {
|
|
11894
|
+
for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
|
|
11895
|
+
}
|
|
11896
|
+
for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
|
|
11897
|
+
for (int k = 0; k < bs8; ++k) {
|
|
11898
|
+
uint8_t s = 0;
|
|
11899
|
+
for (int i = 0; i < 8; ++i) {
|
|
11900
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
|
11901
|
+
else {
|
|
11902
|
+
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
|
9561
11903
|
}
|
|
9562
|
-
xval[8*k+imin] = -xval[8*k+imin];
|
|
9563
|
-
s ^= (1 << imin);
|
|
9564
11904
|
}
|
|
9565
|
-
block_signs[k] = s
|
|
11905
|
+
block_signs[k] = s;
|
|
9566
11906
|
}
|
|
9567
11907
|
float max = xval[0];
|
|
9568
|
-
for (int i = 1; i <
|
|
11908
|
+
for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
|
|
9569
11909
|
if (!max) {
|
|
9570
11910
|
scales[ib] = 0;
|
|
9571
|
-
memset(L, 0, 16);
|
|
9572
11911
|
continue;
|
|
9573
11912
|
}
|
|
9574
11913
|
float best = 0;
|
|
9575
11914
|
float scale = max/(2*kMaxQ-1);
|
|
9576
|
-
|
|
9577
|
-
|
|
9578
|
-
float id = (2*kMaxQ-1+is*0.1f)/max;
|
|
11915
|
+
for (int is = -15; is <= 15; ++is) {
|
|
11916
|
+
float id = (2*kMaxQ-1+is*0.2f)/max;
|
|
9579
11917
|
float this_scale = 1/id;
|
|
9580
|
-
for (int k = 0; k <
|
|
9581
|
-
for (int i = 0; i <
|
|
9582
|
-
int l = nearest_int(0.5f*(id*xval[
|
|
9583
|
-
Laux[
|
|
11918
|
+
for (int k = 0; k < bs4; ++k) {
|
|
11919
|
+
for (int i = 0; i < 4; ++i) {
|
|
11920
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
|
11921
|
+
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
|
9584
11922
|
}
|
|
9585
11923
|
uint16_t u = 0;
|
|
9586
|
-
for (int i = 0; i <
|
|
9587
|
-
int grid_index =
|
|
11924
|
+
for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
|
|
11925
|
+
int grid_index = kmap_q3xs[u];
|
|
9588
11926
|
is_on_grid_aux[k] = true;
|
|
9589
11927
|
if (grid_index < 0) {
|
|
9590
11928
|
is_on_grid_aux[k] = false;
|
|
9591
|
-
const uint16_t * neighbours =
|
|
9592
|
-
grid_index =
|
|
11929
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
|
11930
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
|
|
9593
11931
|
}
|
|
9594
11932
|
}
|
|
9595
11933
|
float sumqx = 0, sumq2 = 0;
|
|
9596
|
-
for (int i = 0; i <
|
|
11934
|
+
for (int i = 0; i < block_size; ++i) {
|
|
9597
11935
|
float w = weight[i];
|
|
9598
11936
|
float q = 2*Laux[i] + 1;
|
|
9599
11937
|
sumqx += w*xval[i]*q;
|
|
@@ -9601,31 +11939,32 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
|
9601
11939
|
}
|
|
9602
11940
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
|
9603
11941
|
scale = sumqx/sumq2; best = scale*sumqx;
|
|
9604
|
-
for (int i = 0; i <
|
|
9605
|
-
for (int k = 0; k <
|
|
11942
|
+
for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
|
|
11943
|
+
for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
|
9606
11944
|
}
|
|
9607
11945
|
}
|
|
9608
11946
|
int n_not_ongrid = 0;
|
|
9609
|
-
for (int k = 0; k <
|
|
11947
|
+
for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
|
9610
11948
|
if (n_not_ongrid > 0 && scale > 0) {
|
|
9611
11949
|
float id = 1/scale;
|
|
9612
|
-
for (int k = 0; k <
|
|
11950
|
+
for (int k = 0; k < bs4; ++k) {
|
|
9613
11951
|
if (is_on_grid[k]) continue;
|
|
9614
11952
|
uint16_t u = 0;
|
|
9615
|
-
for (int i = 0; i <
|
|
9616
|
-
int l = nearest_int(0.5f*(id*xval[
|
|
11953
|
+
for (int i = 0; i < 4; ++i) {
|
|
11954
|
+
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
|
9617
11955
|
l = MAX(0, MIN(kMaxQ-1, l));
|
|
9618
|
-
u |= (l <<
|
|
9619
|
-
L[8*k + i] = l;
|
|
11956
|
+
u |= (l << 3*i);
|
|
9620
11957
|
}
|
|
9621
|
-
int grid_index =
|
|
11958
|
+
int grid_index = kmap_q3xs[u];
|
|
9622
11959
|
if (grid_index < 0) {
|
|
9623
|
-
const uint16_t * neighbours =
|
|
9624
|
-
grid_index =
|
|
11960
|
+
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
|
11961
|
+
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
|
|
9625
11962
|
}
|
|
11963
|
+
const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
|
|
11964
|
+
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
|
9626
11965
|
}
|
|
9627
11966
|
float sumqx = 0, sumq2 = 0;
|
|
9628
|
-
for (int i = 0; i <
|
|
11967
|
+
for (int i = 0; i < block_size; ++i) {
|
|
9629
11968
|
float w = weight[i];
|
|
9630
11969
|
float q = 2*L[i] + 1;
|
|
9631
11970
|
sumqx += w*xval[i]*q;
|
|
@@ -9634,356 +11973,572 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
|
9634
11973
|
if (sumq2 > 0) scale = sumqx/sumq2;
|
|
9635
11974
|
}
|
|
9636
11975
|
if (scale < 0) {
|
|
11976
|
+
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
|
11977
|
+
// and correspondingly flip quant signs.
|
|
9637
11978
|
scale = -scale;
|
|
9638
|
-
for (int k = 0; k <
|
|
11979
|
+
for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
|
|
9639
11980
|
}
|
|
9640
|
-
for (int k = 0; k <
|
|
11981
|
+
for (int k = 0; k < bs4; ++k) {
|
|
9641
11982
|
uint16_t u = 0;
|
|
9642
|
-
for (int i = 0; i <
|
|
9643
|
-
int grid_index =
|
|
11983
|
+
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
|
11984
|
+
int grid_index = kmap_q3xs[u];
|
|
9644
11985
|
if (grid_index < 0) {
|
|
9645
11986
|
printf("Oops: found point %u not on grid:", u);
|
|
9646
|
-
for (int i = 0; i <
|
|
11987
|
+
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
|
9647
11988
|
printf("\n");
|
|
9648
11989
|
GGML_ASSERT(false);
|
|
9649
11990
|
}
|
|
9650
|
-
|
|
11991
|
+
qs[k] = grid_index & 255;
|
|
11992
|
+
qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
|
|
9651
11993
|
}
|
|
11994
|
+
qs += bs4;
|
|
11995
|
+
for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
|
|
11996
|
+
signs += bs8;
|
|
9652
11997
|
GGML_ASSERT(scale >= 0);
|
|
9653
11998
|
scales[ib] = scale;
|
|
9654
11999
|
max_scale = MAX(max_scale, scale);
|
|
9655
12000
|
}
|
|
9656
12001
|
|
|
9657
12002
|
if (!max_scale) {
|
|
9658
|
-
memset(y[ibl].qs, 0, QK_K/4);
|
|
9659
12003
|
continue;
|
|
9660
12004
|
}
|
|
9661
12005
|
|
|
9662
12006
|
float d = max_scale/31;
|
|
9663
12007
|
y[ibl].d = GGML_FP32_TO_FP16(d);
|
|
9664
12008
|
float id = 1/d;
|
|
9665
|
-
for (int ib = 0; ib < QK_K/
|
|
9666
|
-
int
|
|
9667
|
-
|
|
9668
|
-
|
|
9669
|
-
|
|
12009
|
+
for (int ib = 0; ib < QK_K/block_size; ib += 2) {
|
|
12010
|
+
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
|
|
12011
|
+
l1 = MAX(0, MIN(15, l1));
|
|
12012
|
+
int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
|
|
12013
|
+
l2 = MAX(0, MIN(15, l2));
|
|
12014
|
+
y[ibl].scales[ib/2] = l1 | (l2 << 4);
|
|
9670
12015
|
}
|
|
9671
|
-
memcpy(y[ibl].qs, q2, QK_K/4);
|
|
9672
12016
|
|
|
9673
12017
|
}
|
|
9674
12018
|
}
|
|
9675
12019
|
|
|
9676
|
-
|
|
12020
|
+
#define IQ3S_BLOCK_SIZE 32
|
|
12021
|
+
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
|
9677
12022
|
(void)hist;
|
|
9678
12023
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
9679
12024
|
int nblock = n_per_row/QK_K;
|
|
12025
|
+
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
|
12026
|
+
float weight[IQ3S_BLOCK_SIZE];
|
|
12027
|
+
float xval[IQ3S_BLOCK_SIZE];
|
|
12028
|
+
int8_t L[IQ3S_BLOCK_SIZE];
|
|
12029
|
+
int8_t Laux[IQ3S_BLOCK_SIZE];
|
|
12030
|
+
float waux[IQ3S_BLOCK_SIZE];
|
|
12031
|
+
bool is_on_grid[IQ3S_BLOCK_SIZE/4];
|
|
12032
|
+
bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
|
|
12033
|
+
uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
|
|
9680
12034
|
char * qrow = (char *)dst;
|
|
9681
12035
|
for (int row = 0; row < nrow; ++row) {
|
|
9682
|
-
|
|
12036
|
+
quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
|
|
12037
|
+
scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
|
|
9683
12038
|
src += n_per_row;
|
|
9684
|
-
qrow += nblock*sizeof(
|
|
12039
|
+
qrow += nblock*sizeof(block_iq3_s);
|
|
9685
12040
|
}
|
|
9686
|
-
return nrow * nblock * sizeof(
|
|
12041
|
+
return nrow * nblock * sizeof(block_iq3_s);
|
|
9687
12042
|
}
|
|
9688
12043
|
|
|
9689
|
-
|
|
12044
|
+
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
|
12045
|
+
assert(k % QK_K == 0);
|
|
12046
|
+
block_iq3_s * restrict y = vy;
|
|
12047
|
+
quantize_row_iq3_s_reference(x, y, k);
|
|
12048
|
+
}
|
|
12049
|
+
|
|
12050
|
+
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
|
12051
|
+
assert(k % QK_K == 0);
|
|
12052
|
+
quantize_iq3_s(x, y, 1, k, NULL, NULL);
|
|
12053
|
+
}
|
|
12054
|
+
|
|
12055
|
+
|
|
12056
|
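In the iq3_s path above each grid index is 9 bits wide (the 512-entry grid): the low 8 bits go into qs[], and the 9th bit is collected into qh[], one bit per index, eight indices per qh byte. A sketch of that packing and its inverse (helper names are illustrative):

    // Sketch of the 9-bit grid-index split across qs[] and qh[] used by iq3_s.
    #include <stdint.h>

    static void pack_index9(uint8_t * qs, uint8_t * qh, int slot, int grid_index) {
        qs[slot]    = (uint8_t)(grid_index & 255);
        qh[slot/8] |= (uint8_t)(((grid_index >> 8) & 1) << (slot % 8));
    }

    static int unpack_index9(const uint8_t * qs, const uint8_t * qh, int slot) {
        int hi = (qh[slot/8] >> (slot % 8)) & 1;
        return qs[slot] | (hi << 8);
    }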
+
// =================================== 1.5 bpw ===================================================
|
|
12057
|
+
|
|
12058
|
+
static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
|
12059
|
+
const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
|
|
12060
|
+
int num_neighbors = neighbours[0];
|
|
12061
|
+
GGML_ASSERT(num_neighbors > 0);
|
|
12062
|
+
float best_score = 0;
|
|
12063
|
+
int grid_index = -1;
|
|
12064
|
+
for (int j = 1; j <= num_neighbors; ++j) {
|
|
12065
|
+
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
|
|
12066
|
+
float sumqx = 0, sumq2 = 0;
|
|
12067
|
+
for (int i = 0; i < 8; ++i) {
|
|
12068
|
+
float q = (pg[i] - 3)/2;
|
|
12069
|
+
float w = weight[i];
|
|
12070
|
+
sumqx += w*q*xval[i];
|
|
12071
|
+
sumq2 += w*q*q;
|
|
12072
|
+
}
|
|
12073
|
+
if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
|
12074
|
+
*scale = sumqx/sumq2; best_score = *scale * sumqx;
|
|
12075
|
+
grid_index = neighbours[j];
|
|
12076
|
+
}
|
|
12077
|
+
}
|
|
12078
|
+
if (grid_index < 0) {
|
|
12079
|
+
for (int i = 0; i < ngrid; ++i) {
|
|
12080
|
+
const int8_t * grid_i = (const int8_t *)(grid + i);
|
|
12081
|
+
float sumqx = 0, sumq2 = 0;
|
|
12082
|
+
for (int j = 0; j < 8; ++j) {
|
|
12083
|
+
float w = weight[j];
|
|
12084
|
+
float q = (grid_i[j] - 3)/2;
|
|
12085
|
+
sumqx += w*q*xval[j];
|
|
12086
|
+
sumq2 += w*q*q;
|
|
12087
|
+
}
|
|
12088
|
+
if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
|
12089
|
+
*scale = sumqx/sumq2; best_score = *scale*sumqx;
|
|
12090
|
+
grid_index = i;
|
|
12091
|
+
}
|
|
12092
|
+
}
|
|
12093
|
+
}
|
|
12094
|
+
if (grid_index < 0) {
|
|
12095
|
+
printf("Oops, did not find grid point\n");
|
|
12096
|
+
printf("Have %d neighbours\n", num_neighbors);
|
|
12097
|
+
for (int j = 1; j <= num_neighbors; ++j) {
|
|
12098
|
+
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
|
|
12099
|
+
float sumqx = 0, sumq2 = 0;
|
|
12100
|
+
for (int i = 0; i < 8; ++i) {
|
|
12101
|
+
float q = (pg[i] - 3)/2;
|
|
12102
|
+
float w = weight[i];
|
|
12103
|
+
sumqx += w*q*xval[i];
|
|
12104
|
+
sumq2 += w*q*q;
|
|
12105
|
+
}
|
|
12106
|
+
printf(" neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
|
|
12107
|
+
}
|
|
12108
|
+
}
|
|
12109
|
+
GGML_ASSERT(grid_index >= 0);
|
|
12110
|
+
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
12111
|
+
*scale *= 1.05f; // This is a fudge factor. Don't ask me why it improves the result.
|
|
12112
|
+
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
12113
|
+
const int8_t * pg = (const int8_t *)(grid + grid_index);
|
|
12114
|
+
for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
|
|
12115
|
+
return grid_index;
|
|
12116
|
+
}
|
|
12117
|
+
|
|
12118
|
+
static int iq1_sort_helper(const void * left, const void * right) {
|
|
12119
|
+
const float * l = left;
|
|
12120
|
+
const float * r = right;
|
|
12121
|
+
return *l < *r ? -1 : *l > *r ? 1 : 0;
|
|
12122
|
+
}
|
|
12123
|
+
|
|
12124
|
+
static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
|
12125
|
+
|
|
12126
|
+
const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
|
|
12127
|
+
|
|
12128
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
|
12129
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
|
12130
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
|
12131
|
+
|
|
12132
|
+
GGML_ASSERT(quant_weights && "missing quantization weights");
|
|
12133
|
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
|
12134
|
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
|
12135
|
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
|
12136
|
+
GGML_ASSERT(n%QK_K == 0);
|
|
12137
|
+
|
|
12138
|
+
const int nbl = n/QK_K;
|
|
12139
|
+
|
|
12140
|
+
block_iq1_s * y = vy;
|
|
12141
|
+
|
|
12142
|
+
float scales[QK_K/8];
|
|
12143
|
+
float weight[8];
|
|
12144
|
+
int8_t L[8];
|
|
12145
|
+
float sumx[9];
|
|
12146
|
+
float sumw[9];
|
|
12147
|
+
float pairs[16];
|
|
12148
|
+
int * idx = (int *)(pairs + 1);
|
|
12149
|
+
uint8_t hbit[QK_K/8];
|
|
12150
|
+
|
|
12151
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
|
12152
|
+
|
|
12153
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
|
12154
|
+
memset(y[ibl].qs, 0, QK_K/8);
|
|
12155
|
+
memset(y[ibl].scales, 0, QK_K/16);
|
|
12156
|
+
|
|
12157
|
+
float max_scale = 0;
|
|
12158
|
+
|
|
12159
|
+
const float * xbl = x + QK_K*ibl;
|
|
12160
|
+
float sumx2 = 0;
|
|
12161
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
|
12162
|
+
float sigma2 = sumx2/QK_K;
|
|
12163
|
+
|
|
12164
|
+
for (int ib = 0; ib < QK_K/8; ++ib) {
|
|
12165
|
+
const float * xb = xbl + 8*ib;
|
|
12166
|
+
const float * qw = quant_weights + QK_K*ibl + 8*ib;
|
|
12167
|
+
for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
|
12168
|
+
float max = fabsf(xb[0]);
|
|
12169
|
+
for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i]));
|
|
12170
|
+
if (!max) {
|
|
12171
|
+
scales[ib] = 0;
|
|
12172
|
+
memset(L, 1, 8);
|
|
12173
|
+
continue;
|
|
12174
|
+
}
|
|
12175
|
+
// Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
|
|
12176
|
+
// With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
|
|
12177
|
+
// boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
|
|
12178
|
+
// in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
|
|
12179
|
+
// Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
|
|
12180
|
+
// for each possible and score for each split.
|
|
12181
|
+
for (int j = 0; j < 8; ++j) {
|
|
12182
|
+
pairs[2*j] = xb[j];
|
|
12183
|
+
idx[2*j] = j;
|
|
12184
|
+
}
|
|
12185
|
+
qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper);
|
|
12186
|
+
{
|
|
12187
|
+
sumx[0] = sumw[0] = 0;
|
|
12188
|
+
for (int j = 0; j < 8; ++j) {
|
|
12189
|
+
int i = idx[2*j];
|
|
12190
|
+
sumx[j+1] = sumx[j] + weight[i]*xb[i];
|
|
12191
|
+
sumw[j+1] = sumw[j] + weight[i];
|
|
12192
|
+
}
|
|
12193
|
+
}
|
|
12194
|
+
float best_score = 0, scale = max;
|
|
12195
|
+
int besti1 = 0, besti2 = 0;
|
|
12196
|
+
for (int i1 = 0; i1 <= 8; ++i1) {
|
|
12197
|
+
for (int i2 = i1; i2 <= 8; ++i2) {
|
|
12198
|
+
float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]);
|
|
12199
|
+
float sumq2 = (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]);
|
|
12200
|
+
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
|
12201
|
+
scale = sumqx/sumq2; best_score = scale*sumqx;
|
|
12202
|
+
besti1 = i1; besti2 = i2;
|
|
12203
|
+
}
|
|
12204
|
+
}
|
|
12205
|
+
}
|
|
12206
|
+
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
|
|
12207
|
+
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
|
|
12208
|
+
for (int j = besti2; j < 8; ++j) L[idx[2*j]] = 2;
|
|
12209
|
+
if (scale < 0) {
|
|
12210
|
+
for (int j = 0; j < 8; ++j) L[j] = 2 - L[j];
|
|
12211
|
+
scale = -scale;
|
|
12212
|
+
}
|
|
12213
|
+
// Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring
|
|
12214
|
+
// grid point that minimizes SSD.
|
|
12215
|
+
uint16_t u = 0;
|
|
12216
|
+
for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
|
|
12217
|
+
int grid_index = kmap_q2xs[u];
|
|
12218
|
+
if (grid_index < 0) {
|
|
12219
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
|
12220
|
+
grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
|
|
12221
|
+
GGML_ASSERT(grid_index >= 0);
|
|
12222
|
+
}
|
|
12223
|
+
y[ibl].qs[ib] = grid_index & 255;
|
|
12224
|
+
hbit[ib] = grid_index >> 8;
|
|
12225
|
+
GGML_ASSERT(scale >= 0);
|
|
12226
|
+
scales[ib] = scale;
|
|
12227
|
+
max_scale = MAX(max_scale, scale);
|
|
12228
|
+
}
|
|
12229
|
+
|
|
12230
|
+
if (!max_scale) {
|
|
12231
|
+
memset(y[ibl].qs, 0, QK_K/8);
|
|
12232
|
+
continue;
|
|
12233
|
+
}
|
|
12234
|
+
|
|
12235
|
+
float d = max_scale/15;
|
|
12236
|
+
y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
|
|
12237
|
+
float id = 1/d;
|
|
12238
|
+
for (int ib = 0; ib < QK_K/8; ++ib) {
|
|
12239
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
|
12240
|
+
l = MAX(0, MIN(7, l));
|
|
12241
|
+
if (hbit[ib]) l |= 8;
|
|
12242
|
+
y[ibl].scales[ib/2] |= (l << 4*(ib%2));
|
|
12243
|
+
}
|
|
12244
|
+
}
|
|
12245
|
+
}
|
|
12246
|
+
|
|
12247
|
+
size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
|
9690
12248
|
(void)hist;
|
|
9691
12249
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
9692
12250
|
int nblock = n_per_row/QK_K;
|
|
9693
12251
|
char * qrow = (char *)dst;
|
|
9694
12252
|
for (int row = 0; row < nrow; ++row) {
|
|
9695
|
-
|
|
12253
|
+
quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
|
|
9696
12254
|
src += n_per_row;
|
|
9697
|
-
qrow += nblock*sizeof(
|
|
12255
|
+
qrow += nblock*sizeof(block_iq1_s);
|
|
9698
12256
|
}
|
|
9699
|
-
return nrow * nblock * sizeof(
|
|
12257
|
+
return nrow * nblock * sizeof(block_iq1_s);
|
|
9700
12258
|
}
|
|
9701
12259
|
|
|
9702
|
-
//
|
|
9703
|
-
// ============================================= 3-bit using D4 lattice
|
|
9704
|
-
//
|
|
9705
|
-
|
|
9706
|
-
typedef struct {
|
|
9707
|
-
uint32_t * grid;
|
|
9708
|
-
int * map;
|
|
9709
|
-
uint16_t * neighbours;
|
|
9710
|
-
} iq3_entry_t;
|
|
9711
|
-
|
|
9712
|
-
static iq3_entry_t iq3_data[1] = {
|
|
9713
|
-
{NULL, NULL, NULL},
|
|
9714
|
-
};
|
|
12260
|
+
// ============================ 4-bit non-linear quants
|
|
9715
12261
|
|
|
9716
|
-
static inline int
|
|
9717
|
-
(
|
|
9718
|
-
|
|
9719
|
-
|
|
12262
|
+
static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|
12263
|
+
if (x <= val[0]) return 0;
|
|
12264
|
+
if (x >= val[n-1]) return n-1;
|
|
12265
|
+
int ml = 0, mu = n-1;
|
|
12266
|
+
while (mu-ml > 1) {
|
|
12267
|
+
int mav = (ml+mu)/2;
|
|
12268
|
+
if (x < val[mav]) mu = mav; else ml = mav;
|
|
12269
|
+
}
|
|
12270
|
+
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
|
9720
12271
|
}
|
|
9721
12272
|
|
|
9722
|
-
static
|
|
9723
|
-
|
|
9724
|
-
|
|
9725
|
-
|
|
9726
|
-
|
|
12273
|
+
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
|
|
12274
|
+
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
|
12275
|
+
float * scales, float * weight, uint8_t * L,
|
|
12276
|
+
const int8_t * values,
|
|
12277
|
+
const float * quant_weights) {
|
|
9727
12278
|
|
|
9728
|
-
|
|
9729
|
-
const int gindex = iq3_data_index(grid_size);
|
|
9730
|
-
if (iq3_data[gindex].grid) {
|
|
9731
|
-
return;
|
|
9732
|
-
}
|
|
9733
|
-
static const uint16_t kgrid_256[256] = {
|
|
9734
|
-
0, 2, 4, 9, 11, 15, 16, 18, 25, 34, 59, 61, 65, 67, 72, 74,
|
|
9735
|
-
81, 85, 88, 90, 97, 108, 120, 128, 130, 132, 137, 144, 146, 153, 155, 159,
|
|
9736
|
-
169, 175, 189, 193, 199, 200, 202, 213, 248, 267, 287, 292, 303, 315, 317, 321,
|
|
9737
|
-
327, 346, 362, 413, 436, 456, 460, 462, 483, 497, 513, 515, 520, 522, 529, 531,
|
|
9738
|
-
536, 538, 540, 551, 552, 576, 578, 585, 592, 594, 641, 643, 648, 650, 657, 664,
|
|
9739
|
-
698, 704, 706, 720, 729, 742, 758, 769, 773, 808, 848, 852, 870, 889, 901, 978,
|
|
9740
|
-
992, 1024, 1026, 1033, 1035, 1040, 1042, 1046, 1049, 1058, 1089, 1091, 1093, 1096, 1098, 1105,
|
|
9741
|
-
1112, 1139, 1143, 1144, 1152, 1154, 1161, 1167, 1168, 1170, 1183, 1184, 1197, 1217, 1224, 1228,
|
|
9742
|
-
1272, 1276, 1309, 1323, 1347, 1367, 1377, 1404, 1473, 1475, 1486, 1509, 1537, 1544, 1546, 1553,
|
|
9743
|
-
1555, 1576, 1589, 1594, 1600, 1602, 1616, 1625, 1636, 1638, 1665, 1667, 1672, 1685, 1706, 1722,
|
|
9744
|
-
1737, 1755, 1816, 1831, 1850, 1856, 1862, 1874, 1901, 1932, 1950, 1971, 2011, 2032, 2052, 2063,
|
|
9745
|
-
2077, 2079, 2091, 2095, 2172, 2192, 2207, 2208, 2224, 2230, 2247, 2277, 2308, 2345, 2356, 2389,
|
|
9746
|
-
2403, 2424, 2501, 2504, 2506, 2520, 2570, 2593, 2616, 2624, 2630, 2646, 2669, 2700, 2714, 2746,
|
|
9747
|
-
2754, 2795, 2824, 2835, 2839, 2874, 2882, 2905, 2984, 3028, 3042, 3092, 3108, 3110, 3124, 3153,
|
|
9748
|
-
3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
|
|
9749
|
-
3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
|
|
9750
|
-
};
|
|
9751
|
-
const int kmap_size = 4096;
|
|
9752
|
-
const int nwant = 2;
|
|
9753
|
-
const uint16_t * kgrid = kgrid_256;
|
|
9754
|
-
uint32_t * kgrid_q3xs;
|
|
9755
|
-
int * kmap_q3xs;
|
|
9756
|
-
uint16_t * kneighbors_q3xs;
|
|
12279
|
+
const int ntry = 7;
|
|
9757
12280
|
|
|
9758
|
-
|
|
9759
|
-
|
|
9760
|
-
|
|
9761
|
-
|
|
9762
|
-
|
|
9763
|
-
|
|
9764
|
-
|
|
9765
|
-
|
|
9766
|
-
|
|
9767
|
-
|
|
9768
|
-
|
|
9769
|
-
|
|
9770
|
-
|
|
9771
|
-
|
|
9772
|
-
|
|
9773
|
-
uint8_t * aux8 = (uint8_t *)&aux32;
|
|
9774
|
-
for (int i = 0; i < grid_size; ++i) {
|
|
9775
|
-
aux32 = kgrid_q3xs[i];
|
|
9776
|
-
uint16_t index = 0;
|
|
9777
|
-
for (int k=0; k<4; ++k) {
|
|
9778
|
-
uint16_t q = (aux8[k] - 1)/2;
|
|
9779
|
-
index |= (q << 3*k);
|
|
9780
|
-
}
|
|
9781
|
-
kmap_q3xs[index] = i;
|
|
9782
|
-
}
|
|
9783
|
-
int8_t pos[4];
|
|
9784
|
-
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
|
9785
|
-
int num_neighbors = 0, num_not_in_map = 0;
|
|
9786
|
-
for (int i = 0; i < kmap_size; ++i) {
|
|
9787
|
-
if (kmap_q3xs[i] >= 0) continue;
|
|
9788
|
-
++num_not_in_map;
|
|
9789
|
-
for (int k = 0; k < 4; ++k) {
|
|
9790
|
-
int l = (i >> 3*k) & 0x7;
|
|
9791
|
-
pos[k] = 2*l + 1;
|
|
9792
|
-
}
|
|
9793
|
-
for (int j = 0; j < grid_size; ++j) {
|
|
9794
|
-
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
|
|
9795
|
-
int d2 = 0;
|
|
9796
|
-
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
|
9797
|
-
dist2[2*j+0] = d2;
|
|
9798
|
-
dist2[2*j+1] = j;
|
|
12281
|
+
float sigma2 = 0;
|
|
12282
|
+
for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
|
|
12283
|
+
sigma2 *= 2.f/super_block_size;
|
|
12284
|
+
|
|
12285
|
+
memset(q4, 0, super_block_size/2);
|
|
12286
|
+
dh[0] = GGML_FP32_TO_FP16(0.f);
|
|
12287
|
+
|
|
12288
|
+
float max_scale = 0, amax_scale = 0;
|
|
12289
|
+
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
|
12290
|
+
const float * xb = x + ib*block_size;
|
|
12291
|
+
if (quant_weights) {
|
|
12292
|
+
const float * qw = quant_weights + ib*block_size;
|
|
12293
|
+
for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
|
|
12294
|
+
} else {
|
|
12295
|
+
for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
|
|
9799
12296
|
}
|
|
9800
|
-
|
|
9801
|
-
int
|
|
9802
|
-
|
|
9803
|
-
|
|
9804
|
-
|
|
9805
|
-
if (nhave == nwant) break;
|
|
9806
|
-
d2 = dist2[2*j];
|
|
9807
|
-
++nhave;
|
|
12297
|
+
float amax = 0, max = 0;
|
|
12298
|
+
for (int j = 0; j < block_size; ++j) {
|
|
12299
|
+
float ax = fabsf(xb[j]);
|
|
12300
|
+
if (ax > amax) {
|
|
12301
|
+
amax = ax; max = xb[j];
|
|
9808
12302
|
}
|
|
9809
|
-
++n;
|
|
9810
12303
|
}
|
|
9811
|
-
|
|
9812
|
-
|
|
9813
|
-
|
|
9814
|
-
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
|
9815
|
-
iq3_data[gindex].neighbours = kneighbors_q3xs;
|
|
9816
|
-
int counter = 0;
|
|
9817
|
-
for (int i = 0; i < kmap_size; ++i) {
|
|
9818
|
-
if (kmap_q3xs[i] >= 0) continue;
|
|
9819
|
-
for (int k = 0; k < 4; ++k) {
|
|
9820
|
-
int l = (i >> 3*k) & 0x7;
|
|
9821
|
-
pos[k] = 2*l + 1;
|
|
12304
|
+
if (!amax) {
|
|
12305
|
+
scales[ib] = 0;
|
|
12306
|
+
continue;
|
|
9822
12307
|
}
|
|
9823
|
-
|
|
9824
|
-
|
|
9825
|
-
|
|
9826
|
-
|
|
9827
|
-
|
|
9828
|
-
|
|
12308
|
+
float d = -max/values[0];
|
|
12309
|
+
float id = 1/d;
|
|
12310
|
+
float sumqx = 0, sumq2 = 0;
|
|
12311
|
+
for (int j = 0; j < block_size; ++j) {
|
|
12312
|
+
float al = id*xb[j];
|
|
12313
|
+
int l = best_index_int8(16, values, al);
|
|
12314
|
+
float q = values[l];
|
|
12315
|
+
float w = weight[j];
|
|
12316
|
+
sumqx += w*q*xb[j];
|
|
12317
|
+
sumq2 += w*q*q;
|
|
12318
|
+
}
|
|
12319
|
+
d = sumqx/sumq2;
|
|
12320
|
+
float best = d*sumqx;
|
|
12321
|
+
for (int itry = -ntry; itry <= ntry; ++itry) {
|
|
12322
|
+
id = (itry + values[0])/max;
|
|
12323
|
+
sumqx = sumq2 = 0;
|
|
12324
|
+
for (int j = 0; j < block_size; ++j) {
|
|
12325
|
+
float al = id*xb[j];
|
|
12326
|
+
int l = best_index_int8(16, values, al);
|
|
12327
|
+
float q = values[l];
|
|
12328
|
+
float w = weight[j];
|
|
12329
|
+
sumqx += w*q*xb[j];
|
|
12330
|
+
sumq2 += w*q*q;
|
|
12331
|
+
}
|
|
12332
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
|
12333
|
+
d = sumqx/sumq2; best = d * sumqx;
|
|
12334
|
+
}
|
|
9829
12335
|
}
|
|
9830
|
-
|
|
9831
|
-
|
|
9832
|
-
|
|
9833
|
-
|
|
9834
|
-
|
|
9835
|
-
|
|
9836
|
-
|
|
9837
|
-
|
|
9838
|
-
|
|
9839
|
-
|
|
12336
|
+
scales[ib] = d;
|
|
12337
|
+
float abs_d = fabsf(d);
|
|
12338
|
+
if (abs_d > amax_scale) {
|
|
12339
|
+
amax_scale = abs_d; max_scale = d;
|
|
12340
|
+
}
|
|
12341
|
+
}
|
|
12342
|
+
|
|
12343
|
+
if (super_block_size/block_size > 1) {
|
|
12344
|
+
int nb = super_block_size/block_size;
|
|
12345
|
+
memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
|
|
12346
|
+
float d = -max_scale/32;
|
|
12347
|
+
dh[0] = GGML_FP32_TO_FP16(d);
|
|
12348
|
+
float id = d ? 1/d : 0.f;
|
|
12349
|
+
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
|
|
12350
|
+
int l = nearest_int(id*scales[ib]);
|
|
12351
|
+
l = MAX(-32, MIN(31, l));
|
|
12352
|
+
float dl = d * l;
|
|
12353
|
+
float idl = dl ? 1/dl : 0.f;
|
|
12354
|
+
uint8_t * Lb = L + ib*block_size;
|
|
12355
|
+
const float * xb = x + ib*block_size;
|
|
12356
|
+
for (int j = 0; j < block_size; ++j) {
|
|
12357
|
+
Lb[j] = best_index_int8(16, values, idl*xb[j]);
|
|
9840
12358
|
}
|
|
9841
|
-
|
|
9842
|
-
|
|
12359
|
+
l += 32;
|
|
12360
|
+
uint8_t l_l = l & 0xf;
|
|
12361
|
+
uint8_t l_h = l >> 4;
|
|
12362
|
+
if (ib%2 == 0) scales_l[ib/2] = l_l;
|
|
12363
|
+
else scales_l[ib/2] |= (l_l << 4);
|
|
12364
|
+
scales_h[ib/8] |= (l_h << 2*(ib%8));
|
|
12365
|
+
}
|
|
12366
|
+
} else {
|
|
12367
|
+
dh[0] = GGML_FP32_TO_FP16(scales[0]);
|
|
12368
|
+
float id = scales[0] ? 1/scales[0] : 0;
|
|
12369
|
+
for (int j = 0; j < super_block_size; ++j) {
|
|
12370
|
+
L[j] = best_index_int8(16, values, id*x[j]);
|
|
9843
12371
|
}
|
|
9844
|
-
*start = n;
|
|
9845
12372
|
}
|
|
9846
|
-
free(dist2);
|
|
9847
|
-
}
|
|
9848
12373
|
|
|
9849
|
-
|
|
9850
|
-
|
|
9851
|
-
|
|
9852
|
-
|
|
9853
|
-
free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
|
|
9854
|
-
free(iq3_data[gindex].map); iq3_data[gindex].map = NULL;
|
|
9855
|
-
free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL;
|
|
12374
|
+
for (int i = 0; i < super_block_size/32; ++i) {
|
|
12375
|
+
for (int j = 0; j < 16; ++j) {
|
|
12376
|
+
q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
|
|
12377
|
+
}
|
|
9856
12378
|
}
|
|
9857
12379
|
}
|
|
9858
12380
|
|
|
9859
|
-
|
|
9860
|
-
|
|
9861
|
-
|
|
9862
|
-
|
|
9863
|
-
|
|
9864
|
-
|
|
9865
|
-
|
|
9866
|
-
|
|
9867
|
-
|
|
9868
|
-
|
|
9869
|
-
|
|
9870
|
-
|
|
9871
|
-
|
|
12381
|
+
size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
|
12382
|
+
(void)hist;
|
|
12383
|
+
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
|
12384
|
+
int nblock = n_per_row/QK4_NL;
|
|
12385
|
+
char * qrow = (char *)dst;
|
|
12386
|
+
uint8_t L[QK4_NL];
|
|
12387
|
+
float weight[QK4_NL];
|
|
12388
|
+
uint16_t unused_h;
|
|
12389
|
+
uint8_t * unused_l = NULL;
|
|
12390
|
+
float scale;
|
|
12391
|
+
for (int row = 0; row < nrow; ++row) {
|
|
12392
|
+
block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
|
|
12393
|
+
for (int ibl = 0; ibl < nblock; ++ibl) {
|
|
12394
|
+
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
|
|
12395
|
+
quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
|
|
12396
|
+
&scale, weight, L, kvalues_iq4nl, qw);
|
|
9872
12397
|
}
|
|
9873
|
-
|
|
9874
|
-
|
|
12398
|
+
src += n_per_row;
|
|
12399
|
+
qrow += nblock*sizeof(block_iq4_nl);
|
|
12400
|
+
}
|
|
12401
|
+
return nrow * nblock * sizeof(block_iq4_nl);
|
|
12402
|
+
}
|
|
12403
|
+
|
|
12404
|
+
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
|
|
12405
|
+
assert(k % QK4_NL == 0);
|
|
12406
|
+
block_iq4_nl * restrict y = vy;
|
|
12407
|
+
quantize_row_iq4_nl_reference(x, y, k);
|
|
12408
|
+
}
|
|
12409
|
+
|
|
12410
|
+
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
|
|
12411
|
+
assert(k % QK4_NL == 0);
|
|
12412
|
+
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
|
12413
|
+
}
|
|
12414
|
+
|
|
12415
|
+
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
|
12416
|
+
#if QK_K == 64
|
|
12417
|
+
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
|
12418
|
+
#else
|
|
12419
|
+
(void)hist;
|
|
12420
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
12421
|
+
int nblock = n_per_row/QK_K;
|
|
12422
|
+
char * qrow = (char *)dst;
|
|
12423
|
+
uint8_t L[QK_K];
|
|
12424
|
+
float weight[32];
|
|
12425
|
+
float scales[QK_K/32];
|
|
12426
|
+
for (int row = 0; row < nrow; ++row) {
|
|
12427
|
+
block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
|
|
12428
|
+
for (int ibl = 0; ibl < nblock; ++ibl) {
|
|
12429
|
+
const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
|
|
12430
|
+
quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
|
|
12431
|
+
scales, weight, L, kvalues_iq4nl, qw);
|
|
9875
12432
|
}
|
|
12433
|
+
src += n_per_row;
|
|
12434
|
+
qrow += nblock*sizeof(block_iq4_xs);
|
|
9876
12435
|
}
|
|
9877
|
-
|
|
9878
|
-
|
|
9879
|
-
for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2;
|
|
9880
|
-
return grid_index;
|
|
12436
|
+
return nrow * nblock * sizeof(block_iq4_xs);
|
|
12437
|
+
#endif
|
|
9881
12438
|
}
|
|
9882
12439
|
|
|
9883
|
-
|
|
12440
|
+
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
|
12441
|
+
assert(k % QK_K == 0);
|
|
12442
|
+
block_iq4_xs * restrict y = vy;
|
|
12443
|
+
quantize_row_iq4_xs_reference(x, y, k);
|
|
12444
|
+
}
|
|
9884
12445
|
|
|
9885
|
-
|
|
12446
|
+
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
|
|
12447
|
+
assert(k % QK_K == 0);
|
|
12448
|
+
quantize_iq4_xs(x, y, 1, k, NULL, NULL);
|
|
12449
|
+
}
|
|
9886
12450
|
|
|
9887
|
-
|
|
9888
|
-
const int * kmap_q3xs = iq3_data[gindex].map;
|
|
9889
|
-
const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
|
|
12451
|
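The 4-bit non-linear path above maps each scaled value onto the closest entry of a 16-entry sorted int8 codebook via binary search (best_index_int8) and packs two 4-bit indices per byte. The sketch below restates only those two mechanics; example_codebook is a made-up placeholder, not the real kvalues_iq4nl table, which is defined elsewhere in this file and not shown in this excerpt.

    // Sketch: nearest-code lookup in a sorted codebook plus nibble packing.
    #include <stdint.h>

    static const int8_t example_codebook[16] = {
        -112, -90, -70, -54, -40, -28, -18, -8, 0, 10, 22, 34, 48, 64, 82, 104  // placeholder values
    };

    static int closest_code(const int8_t * val, int n, float x) {
        if (x <= val[0])   return 0;
        if (x >= val[n-1]) return n-1;
        int lo = 0, hi = n-1;
        while (hi - lo > 1) {              // binary search in the sorted codebook
            int mid = (lo + hi)/2;
            if (x < val[mid]) hi = mid; else lo = mid;
        }
        return x - val[hi-1] < val[hi] - x ? hi-1 : hi;
    }

    static uint8_t pack_two_codes(int lo_code, int hi_code) {
        return (uint8_t)(lo_code | (hi_code << 4));   // low nibble first, as in q4[16*i + j]
    }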
+
// =============================== 2.5625 bpw
|
|
9890
12452
|
|
|
9891
|
-
|
|
9892
|
-
|
|
9893
|
-
|
|
9894
|
-
|
|
12453
|
+
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
|
12454
|
+
|
|
12455
|
+
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
|
12456
|
+
|
|
12457
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
|
12458
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
|
12459
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
|
12460
|
+
|
|
12461
|
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
|
12462
|
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
|
12463
|
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
|
9895
12464
|
GGML_ASSERT(n%QK_K == 0);
|
|
9896
12465
|
|
|
9897
|
-
const int kMaxQ =
|
|
12466
|
+
const int kMaxQ = 3;
|
|
9898
12467
|
|
|
9899
|
-
const int nbl = n/
|
|
12468
|
+
const int nbl = n/QK_K;
|
|
9900
12469
|
|
|
9901
|
-
|
|
12470
|
+
block_iq2_s * y = vy;
|
|
9902
12471
|
|
|
9903
|
-
float scales[QK_K/
|
|
9904
|
-
float weight[
|
|
9905
|
-
float xval[
|
|
9906
|
-
int8_t L[
|
|
9907
|
-
int8_t Laux[
|
|
9908
|
-
float waux[
|
|
9909
|
-
bool is_on_grid[
|
|
9910
|
-
bool is_on_grid_aux[
|
|
9911
|
-
uint8_t block_signs[
|
|
9912
|
-
uint8_t q3[3*(QK_K/8)];
|
|
9913
|
-
uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
|
|
12472
|
+
float scales[QK_K/16];
|
|
12473
|
+
float weight[16];
|
|
12474
|
+
float xval[16];
|
|
12475
|
+
int8_t L[16];
|
|
12476
|
+
int8_t Laux[16];
|
|
12477
|
+
float waux[16];
|
|
12478
|
+
bool is_on_grid[2];
|
|
12479
|
+
bool is_on_grid_aux[2];
|
|
12480
|
+
uint8_t block_signs[2];
|
|
9914
12481
|
|
|
9915
12482
|
for (int ibl = 0; ibl < nbl; ++ibl) {
|
|
9916
12483
|
|
|
12484
|
+
memset(&y[ibl], 0, sizeof(block_iq2_s));
|
|
9917
12485
|
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
|
9918
|
-
memset(q3, 0, 3*QK_K/8);
|
|
9919
12486
|
|
|
9920
12487
|
float max_scale = 0;
|
|
9921
12488
|
|
|
9922
12489
|
const float * xbl = x + QK_K*ibl;
|
|
9923
12490
|
float sumx2 = 0;
|
|
9924
12491
|
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
|
9925
|
-
float sigma2 = sumx2/QK_K;
|
|
12492
|
+
float sigma2 = 2*sumx2/QK_K;
|
|
9926
12493
|
|
|
9927
|
-
for (int ib = 0; ib < QK_K/
|
|
9928
|
-
const float * xb = xbl +
|
|
12494
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
|
12495
|
+
const float * xb = xbl + 16*ib;
|
|
9929
12496
|
if (quant_weights) {
|
|
9930
|
-
const float * qw = quant_weights + QK_K*ibl +
|
|
9931
|
-
for (int i = 0; i <
|
|
12497
|
+
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
|
12498
|
+
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
|
9932
12499
|
} else {
|
|
9933
|
-
for (int i = 0; i <
|
|
12500
|
+
for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
|
|
9934
12501
|
}
|
|
9935
|
-
for (int i = 0; i <
|
|
9936
|
-
for (int k = 0; k <
|
|
9937
|
-
int nflip = 0;
|
|
12502
|
+
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
|
12503
|
+
for (int k = 0; k < 2; ++k) {
|
|
9938
12504
|
uint8_t s = 0;
|
|
9939
12505
|
for (int i = 0; i < 8; ++i) {
|
|
9940
12506
|
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
|
9941
12507
|
else {
|
|
9942
|
-
xval[8*k + i] = -xb[8*k + i];
|
|
9943
|
-
}
|
|
9944
|
-
}
|
|
9945
|
-
if (nflip%2) {
|
|
9946
|
-
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
|
9947
|
-
for (int i = 1; i < 8; ++i) {
|
|
9948
|
-
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
|
9949
|
-
if (ax < min) {
|
|
9950
|
-
min = ax; imin = i;
|
|
9951
|
-
}
|
|
12508
|
+
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
|
9952
12509
|
}
|
|
9953
|
-
xval[8*k+imin] = -xval[8*k+imin];
|
|
9954
|
-
s ^= (1 << imin);
|
|
9955
12510
|
}
|
|
9956
|
-
block_signs[k] = s
|
|
12511
|
+
block_signs[k] = s;
|
|
9957
12512
|
}
|
|
9958
12513
|
float max = xval[0];
|
|
9959
|
-
for (int i = 1; i <
|
|
12514
|
+
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
|
9960
12515
|
if (!max) {
|
|
9961
12516
|
scales[ib] = 0;
|
|
9962
|
-
memset(L, 0, 32);
|
|
9963
12517
|
continue;
|
|
9964
12518
|
}
|
|
9965
12519
|
float best = 0;
|
|
9966
12520
|
float scale = max/(2*kMaxQ-1);
|
|
9967
|
-
|
|
9968
|
-
|
|
12521
|
+
is_on_grid[0] = is_on_grid[1] = true;
|
|
12522
|
+
for (int is = -9; is <= 9; ++is) {
|
|
12523
|
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
|
9969
12524
|
float this_scale = 1/id;
|
|
9970
|
-
for (int k = 0; k <
|
|
9971
|
-
for (int i = 0; i <
|
|
9972
|
-
int l = nearest_int(0.5f*(id*xval[
|
|
9973
|
-
Laux[
|
|
12525
|
+
for (int k = 0; k < 2; ++k) {
|
|
12526
|
+
for (int i = 0; i < 8; ++i) {
|
|
12527
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
|
12528
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
|
9974
12529
|
}
|
|
9975
12530
|
uint16_t u = 0;
|
|
9976
|
-
for (int i = 0; i <
|
|
9977
|
-
int grid_index =
|
|
12531
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
|
12532
|
+
int grid_index = kmap_q2xs[u];
|
|
9978
12533
|
is_on_grid_aux[k] = true;
|
|
9979
12534
|
if (grid_index < 0) {
|
|
9980
12535
|
is_on_grid_aux[k] = false;
|
|
9981
|
-
const uint16_t * neighbours =
|
|
9982
|
-
grid_index =
|
|
12536
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
|
12537
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
|
9983
12538
|
}
|
|
9984
12539
|
}
|
|
9985
12540
|
float sumqx = 0, sumq2 = 0;
|
|
9986
|
-
for (int i = 0; i <
|
|
12541
|
+
for (int i = 0; i < 16; ++i) {
|
|
9987
12542
|
float w = weight[i];
|
|
9988
12543
|
float q = 2*Laux[i] + 1;
|
|
9989
12544
|
sumqx += w*xval[i]*q;
|
|
@@ -9991,32 +12546,31 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
|
9991
12546
|
}
|
|
9992
12547
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
|
9993
12548
|
scale = sumqx/sumq2; best = scale*sumqx;
|
|
9994
|
-
for (int i = 0; i <
|
|
9995
|
-
for (int k = 0; k <
|
|
12549
|
+
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
|
12550
|
+
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
|
9996
12551
|
}
|
|
9997
12552
|
}
|
|
9998
12553
|
int n_not_ongrid = 0;
|
|
9999
|
-
for (int k = 0; k <
|
|
12554
|
+
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
|
10000
12555
|
if (n_not_ongrid > 0 && scale > 0) {
|
|
10001
12556
|
float id = 1/scale;
|
|
10002
|
-
for (int k = 0; k <
|
|
12557
|
+
for (int k = 0; k < 2; ++k) {
|
|
10003
12558
|
if (is_on_grid[k]) continue;
|
|
10004
12559
|
uint16_t u = 0;
|
|
10005
|
-
for (int i = 0; i <
|
|
10006
|
-
int l = nearest_int(0.5f*(id*xval[
|
|
12560
|
+
for (int i = 0; i < 8; ++i) {
|
|
12561
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
|
10007
12562
|
l = MAX(0, MIN(kMaxQ-1, l));
|
|
10008
|
-
u |= (l <<
|
|
12563
|
+
u |= (l << 2*i);
|
|
12564
|
+
L[8*k + i] = l;
|
|
10009
12565
|
}
|
|
10010
|
-
int grid_index =
|
|
12566
|
+
int grid_index = kmap_q2xs[u];
|
|
10011
12567
|
if (grid_index < 0) {
|
|
10012
|
-
const uint16_t * neighbours =
|
|
10013
|
-
grid_index =
|
|
12568
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
|
12569
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
|
10014
12570
|
}
|
|
10015
|
-
const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
|
|
10016
|
-
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
|
10017
12571
|
}
|
|
10018
12572
|
float sumqx = 0, sumq2 = 0;
|
|
10019
|
-
for (int i = 0; i <
|
|
12573
|
+
for (int i = 0; i < 16; ++i) {
|
|
10020
12574
|
float w = weight[i];
|
|
10021
12575
|
float q = 2*L[i] + 1;
|
|
10022
12576
|
sumqx += w*xval[i]*q;
|
|
@@ -10025,110 +12579,65 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict
|
|
|
10025
12579
|
if (sumq2 > 0) scale = sumqx/sumq2;
|
|
10026
12580
|
}
|
|
10027
12581
|
if (scale < 0) {
|
|
10028
|
-
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
|
10029
|
-
// and correspondingly flip quant signs.
|
|
10030
12582
|
scale = -scale;
|
|
10031
|
-
for (int k = 0; k <
|
|
12583
|
+
for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
|
|
10032
12584
|
}
|
|
10033
|
-
for (int k = 0; k <
|
|
12585
|
+
for (int k = 0; k < 2; ++k) {
|
|
10034
12586
|
uint16_t u = 0;
|
|
10035
|
-
for (int i = 0; i <
|
|
10036
|
-
int grid_index =
|
|
12587
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
|
12588
|
+
int grid_index = kmap_q2xs[u];
|
|
10037
12589
|
if (grid_index < 0) {
|
|
10038
12590
|
printf("Oops: found point %u not on grid:", u);
|
|
10039
|
-
for (int i = 0; i <
|
|
12591
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
|
10040
12592
|
printf("\n");
|
|
10041
12593
|
GGML_ASSERT(false);
|
|
10042
12594
|
}
|
|
10043
|
-
|
|
12595
|
+
const int i8 = 2*ib + k;
|
|
12596
|
+
y[ibl].qs[i8] = grid_index & 255;
|
|
12597
|
+
y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
|
|
12598
|
+
y[ibl].qs[QK_K/8 + i8] = block_signs[k];
|
|
10044
12599
|
}
|
|
10045
|
-
scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
|
|
10046
12600
|
GGML_ASSERT(scale >= 0);
|
|
10047
12601
|
scales[ib] = scale;
|
|
10048
12602
|
max_scale = MAX(max_scale, scale);
|
|
10049
12603
|
}
|
|
10050
12604
|
|
|
10051
12605
|
if (!max_scale) {
|
|
10052
|
-
memset(y[ibl].qs, 0, 3*QK_K/8);
|
|
10053
12606
|
continue;
|
|
10054
12607
|
}
|
|
10055
12608
|
|
|
10056
12609
|
float d = max_scale/31;
|
|
10057
|
-
y[ibl].d = GGML_FP32_TO_FP16(d);
|
|
12610
|
+
y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
|
|
10058
12611
|
float id = 1/d;
|
|
10059
|
-
|
|
10060
|
-
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
12612
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
|
10061
12613
|
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
|
10062
12614
|
l = MAX(0, MIN(15, l));
|
|
10063
|
-
|
|
10064
|
-
|
|
10065
|
-
const float * xb = xbl + 32*ib;
|
|
10066
|
-
if (quant_weights) {
|
|
10067
|
-
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
|
10068
|
-
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
|
10069
|
-
} else {
|
|
10070
|
-
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
|
10071
|
-
}
|
|
10072
|
-
const float db = 0.25f * d * (1 + 2*l);
|
|
10073
|
-
for (int k = 0; k < 8; ++k) {
|
|
10074
|
-
const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
|
|
10075
|
-
const float * xk = xb + 4*k;
|
|
10076
|
-
const float * wk = weight + 4*k;
|
|
10077
|
-
//const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
|
|
10078
|
-
const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
|
|
10079
|
-
float best_mse = 0; int best_index = q3[8*ib+k];
|
|
10080
|
-
for (int j = 0; j < 4; ++j) {
|
|
10081
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
|
10082
|
-
best_mse += wk[j] * diff * diff;
|
|
10083
|
-
}
|
|
10084
|
-
for (int idx = 0; idx < 256; ++idx) {
|
|
10085
|
-
//grid = (const uint8_t *)(kgrid_q3xs + idx);
|
|
10086
|
-
grid = (const uint8_t *)(iq3xxs_grid + idx);
|
|
10087
|
-
float mse = 0;
|
|
10088
|
-
for (int j = 0; j < 4; ++j) {
|
|
10089
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
|
10090
|
-
mse += wk[j] * diff * diff;
|
|
10091
|
-
}
|
|
10092
|
-
if (mse < best_mse) {
|
|
10093
|
-
best_mse = mse; best_index = idx;
|
|
10094
|
-
}
|
|
10095
|
-
}
|
|
10096
|
-
q3[8*ib+k] = best_index;
|
|
10097
|
-
//grid = (const uint8_t *)(kgrid_q3xs + best_index);
|
|
10098
|
-
grid = (const uint8_t *)(iq3xxs_grid + best_index);
|
|
10099
|
-
for (int j = 0; j < 4; ++j) {
|
|
10100
|
-
float q = db * grid[j] * signs[j];
|
|
10101
|
-
sumqx += wk[j] * q * xk[j];
|
|
10102
|
-
sumq2 += wk[j] * q * q;
|
|
10103
|
-
}
|
|
10104
|
-
}
|
|
10105
|
-
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
|
10106
|
-
}
|
|
12615
|
+
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
|
12616
|
+
else y[ibl].scales[ib/2] |= (l << 4);
|
|
10107
12617
|
}
|
|
10108
|
-
memcpy(y[ibl].qs, q3, 3*QK_K/8);
|
|
10109
12618
|
}
|
|
10110
12619
|
}
|
|
10111
12620
|
|
|
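Compared with the IQ3_XXS code it replaces in this hunk, quantize_row_iq2_s_impl keeps the sign of every value explicitly: for each group of 8, bit i of block_signs[k] is set when value i was negated (s |= (1 << i)), and that byte is stored next to the grid indices rather than being folded through the even-signs codebook. A small self-contained sketch of that packing convention follows; it is illustrative only and uses made-up data.

    /* Illustrative sketch only: the one-bit-per-value sign packing used by the
     * new IQ2_S path above (bit i set means value i of the group was negative). */
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const float x[8] = { 0.5f, -1.0f, 2.0f, -0.25f, 0.75f, -3.0f, 1.5f, -0.5f };

        uint8_t s = 0;
        float xval[8];
        for (int i = 0; i < 8; ++i) {
            if (x[i] >= 0) xval[i] = x[i];
            else { xval[i] = -x[i]; s |= (uint8_t)(1 << i); } // record the flipped sign
        }

        // unpacking: restore each value's sign from its bit
        for (int i = 0; i < 8; ++i) {
            printf("%g ", ((s >> i) & 1) ? -xval[i] : xval[i]);
        }
        printf("(signs byte = 0x%02x)\n", s); // 0xaa for this example
        return 0;
    }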
-size_t
+size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
     GGML_ASSERT(n_per_row%QK_K == 0);
     int nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
-
+        quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
         src += n_per_row;
-        qrow += nblock*sizeof(
+        qrow += nblock*sizeof(block_iq2_s);
     }
-    return nrow * nblock * sizeof(
+    return nrow * nblock * sizeof(block_iq2_s);
 }
 
-void
+void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
     assert(k % QK_K == 0);
-
-    quantize_row_iq3_xxs_reference(x, y, k);
+    quantize_iq2_s(x, y, 1, k, NULL, NULL);
 }
 
-void
+void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK_K == 0);
-
+    block_iq2_s * restrict y = vy;
+    quantize_row_iq2_s_reference(x, y, k);
 }
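The IQ2_S entry points added here mirror the other new types: quantize_row_iq2_s delegates to quantize_row_iq2_s_reference, which calls quantize_iq2_s with nrow = 1 and no importance weights, and quantize_iq2_s in turn runs quantize_row_iq2_s_impl once per row. The "2.5625 bpw" banner can be sanity-checked from the block size; the snippet below is a hedged sketch that assumes the block_iq2_s layout and QK_K value from the vendored ggml-quants.h, which are not shown in this hunk.

    /* Illustrative sketch only: checking the "2.5625 bpw" figure for IQ2_S.
     * Assumes block_iq2_s and QK_K from the vendored ggml-quants.h. */
    #include <stdio.h>
    #include "ggml-quants.h"

    int main(void) {
        // bits stored per block divided by weights per block
        const double bpw = 8.0 * sizeof(block_iq2_s) / QK_K;
        // with the upstream QK_K = 256 layout this prints 82 bytes -> 2.5625 bpw
        printf("IQ2_S: %zu bytes per %d weights -> %.4f bpw\n",
               sizeof(block_iq2_s), QK_K, bpw);
        return 0;
    }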