llama_cpp 0.12.6 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
The hunks below come from data/vendor/tmp/llama.cpp/ggml-quants.c (+908 -54 in the file list above).

@@ -438,6 +438,30 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
     return res;
 }

+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
 #else

 #define ggml_int16x8x2_t int16x8x2_t
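This fallback is a scalar stand-in for the AArch64 `vqtbl1q_s8` table-lookup intrinsic (aliased to the real intrinsic in the next hunk when it is available). It indexes `a[b[i]]` directly, so it is only safe while every index stays below 16; the hardware instruction instead yields 0 for out-of-range indices. A minimal scalar model of the stricter hardware semantics, for illustration only (not part of the diff):

```c
#include <stdint.h>

// Hypothetical scalar model of vqtbl1q_s8 (illustration, not ggml code):
// indices 0..15 select a byte from the table, larger indices yield 0.
// The fallback above skips this range check, relying on nibble inputs.
static void vqtbl1q_s8_model(const int8_t a[16], const uint8_t b[16], int8_t res[16]) {
    for (int i = 0; i < 16; ++i) {
        res[i] = b[i] < 16 ? a[b[i]] : 0;
    }
}
```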
@@ -451,6 +475,7 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
 #define ggml_vld1q_u8_x4 vld1q_u8_x4
 #define ggml_vld1q_s8_x2 vld1q_s8_x2
 #define ggml_vld1q_s8_x4 vld1q_s8_x4
+#define ggml_vqtbl1q_s8 vqtbl1q_s8

 #endif

@@ -1837,9 +1862,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
     float sigma2 = sumx2/QK_K;
     for (int j = 0; j < QK_K/16; ++j) {
         const float * restrict qw = quant_weights + QK_K * i + 16*j;
-        for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
-        for (int l = 0; l < 16; ++l) sw[j] += weight[l];
-        scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+        for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
+        for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
+        scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
     }

     float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
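The only change here is the loop bound: the literal 16 becomes QK_K/16, so the sub-block length is derived from the super-block size instead of being hard-coded. A hypothetical compile-time check (not part of the diff) makes the equivalence under the default configuration explicit:

```c
#include <assert.h>

// Hypothetical check, not from the diff: ggml's default super-block size
// is QK_K == 256, for which QK_K/16 still equals the old literal 16, so
// only non-default QK_K builds see different behaviour from this hunk.
#define QK_K 256
static_assert(QK_K/16 == 16, "bound changes only when QK_K != 256");
```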
@@ -3480,6 +3505,139 @@ static const uint32_t iq3xxs_grid[256] = {
     0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };

+#define NGRID_IQ2XXS 512
+static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
+    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
+    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
+    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
+    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
+    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
+    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
+    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
+    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
+    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
+    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
+    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
+    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
+    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
+    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
+    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
+    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
+    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
+    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
+    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
+    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
+    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
+    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
+    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
+    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
+    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
+    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
+    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
+    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
+    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
+    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
+    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
+    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
+    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
+    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
+    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
+    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
+    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
+    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
+    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
+    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
+    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
+    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
+    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
+    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
+    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
+    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
+    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
+    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
+    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
+    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
+    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
+    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
+    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
+    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
+    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
+    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
+    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
+    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
+    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
+    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
+    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
+    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
+    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
+    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
+    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
+    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
+    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
+    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
+    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
+    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
+    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
+    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
+    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
+    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
+    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
+    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
+    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
+    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
+    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
+    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
+    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
+    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
+    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
+    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
+    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
+    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
+    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
+    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
+    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
+    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
+    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
+    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
+    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
+    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
+    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
+    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
+    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
+    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
+    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
+    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
+    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
+    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
+    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
+    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
+    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
+    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
+    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
+    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
+    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
+    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
+    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
+    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
+    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
+    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
+    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
+    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
+    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
+    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
+    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
+    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
+    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
+    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
+    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
+    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
+    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
+    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
+    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
+    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
+
+};
+
 static const uint8_t ksigns_iq2xs[128] = {
       0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
     144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
@@ -3578,6 +3736,69 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
     }
 }

+// ====================== 1.5625 bpw (de)-quantization
+
+void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    float db[4];
+    uint16_t idx[4];
+    //const int8_t * grid[4];
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * sc = x[i].scales;
+        const uint8_t * qs = x[i].qs;
+
+        for (int i8 = 0; i8 < QK_K/8; i8 += 4) {
+            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
+            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
+            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
+            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
+            //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
+            //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
+            //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
+            //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
+            db[0] = d * (2*(sc[0] & 7) + 1);
+            db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
+            db[2] = d * (2*(sc[1] & 7) + 1);
+            db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                for (int j = 0; j < 8; ++j) {
+                    //y[j] = db[l] * grid[l][j];
+                    y[j] = db[l] * grid[j];
+                }
+                y += 8;
+            }
+            qs += 4;
+            sc += 2;
+        }
+    }
+}
+
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
+    assert(k % QK4_NL == 0);
+    const int nb = k / QK4_NL;
+
+    for (int i = 0; i < nb; i++) {
+
+        const uint8_t * qs = x[i].qs;
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        for (int j = 0; j < QK4_NL/2; ++j) {
+            y[j+ 0] = d * kvalues_iq4nl[qs[j] & 0xf];
+            y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >> 4];
+        }
+        y += QK4_NL;
+        qs += QK4_NL/2;
+    }
+}
+
 //===================================== Q8_K ==============================================

 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
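Each IQ1_S group of 8 weights is addressed by a 9-bit grid index: 8 bits come from a `qs` byte and the ninth is stashed in a `scales` nibble, whose remaining 3 bits encode an odd multiplier 2*s+1 of the fp16 block scale `d`. A self-contained illustration of the unpacking, mirroring the code above but with hypothetical byte values (the `iq1s_grid` lookup is omitted):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t qs[4] = {0x12, 0x34, 0x56, 0x78}; // hypothetical packed index bytes
    const uint8_t sc[2] = {0x9a, 0xbc};             // hypothetical scales + high index bits
    uint16_t idx[4];
    idx[0] = qs[0] | ((sc[0] & 0x08) << 5);  // bit 3 of sc[0] becomes index bit 8
    idx[1] = qs[1] | ((sc[0] & 0x80) << 1);  // bit 7 of sc[0] becomes index bit 8
    idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
    idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
    for (int l = 0; l < 4; ++l) {
        printf("idx[%d] = %u\n", l, (unsigned) idx[l]);
    }
    // the two 3-bit scale fields per byte yield odd multipliers 1, 3, ..., 15 of d
    printf("mult[0] = %d\n", 2*(sc[0] & 7) + 1);         // low nibble of sc[0]
    printf("mult[1] = %d\n", 2*((sc[0] >> 4) & 7) + 1);  // high nibble of sc[0]
    return 0;
}
```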
@@ -3848,15 +4069,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r

         const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);

-        __m128i
-        __m128i
-
-        const __m128i i32_0 = mul_sum_i8_pairs(
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);

-
-
-
-        const __m128i i32_1 = mul_sum_i8_pairs(
+        bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
+        by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);

         // Convert int32_t to float
         __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
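The `-` lines in this and the next few hunks arrive truncated from the diff viewer; the `+` side carries the complete replacement code, which renames the block temporaries to `bx_0`/`by_0`. For q4_0, each `qs` byte packs two 4-bit quants with a +8 bias: `lowMask` keeps the low nibbles, the 4-bit shift exposes the high ones, and subtracting `off` recenters them to signed values. A scalar sketch of the same unpacking (helper name hypothetical, not ggml code):

```c
#include <stdint.h>

// Hypothetical scalar equivalent of the SSE nibble unpacking above.
static void unpack_q4_0_byte(uint8_t packed, int8_t * lo, int8_t * hi) {
    *lo = (int8_t)((packed & 0x0f) - 8);  // low nibble,  cf. _mm_and_si128(lowMask, tmp)
    *hi = (int8_t)((packed >>  4) - 8);   // high nibble, cf. _mm_srli_epi64(tmp, 4)
}
```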
@@ -4442,21 +4663,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));

-        __m256i
+        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
         const __m256i bxhi = bytes_from_bits_32(x[i].qh);
         __m128i bxhil = _mm256_castsi256_si128(bxhi);
         __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
         bxhil = _mm_andnot_si128(bxhil, mask);
         bxhih = _mm_andnot_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(
-        __m128i bxh = _mm256_extractf128_si256(
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-
+        bx_0 = MM256_SET_M128I(bxh, bxl);

-        const __m256i
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);

-        const __m256 q = mul_sum_i8_pairs_float(
+        const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);

         /* Multiply q with scale and accumulate */
         acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
@@ -4749,22 +4970,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;

-        __m256i
+        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
         const __m256i bxhi = bytes_from_bits_32(x[i].qh);
         __m128i bxhil = _mm256_castsi256_si128(bxhi);
         __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
         bxhil = _mm_and_si128(bxhil, mask);
         bxhih = _mm_and_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(
-        __m128i bxh = _mm256_extractf128_si256(
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-
+        bx_0 = MM256_SET_M128I(bxh, bxl);

         const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);

-        const __m256 q = mul_sum_us8_pairs_float(
+        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);

         acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
     }
@@ -4993,10 +5214,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r

     for (int i = 0; i < nb; i++) {
         // load elements
-        vint8m1_t
-        vint8m1_t
+        vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
+        vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);

-        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);

         vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
         vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
@@ -5433,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     for (int i = 0; i < nb; ++i) {

-        const float d
-        const float dmin = -y[i].d * (
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

         const uint8_t * restrict q2 = x[i].qs;
         const int8_t * restrict q8 = y[i].qs;
@@ -5583,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     for (int i = 0; i < nb; ++i) {

-        const float d
-        const float dmin = -y[i].d * (
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

         const uint8_t * restrict q2 = x[i].qs;
         const int8_t * restrict q8 = y[i].qs;
@@ -6237,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

         const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
         q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
@@ -6439,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

         vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);

@@ -6942,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         aux16[1] = (a[0] >> 4) & 0x0f0f;

         const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * (
+        sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;

-        const float d = y[i].d * (
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);

         const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);

@@ -7602,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;

         const uint8_t * restrict q5 = x[i].qs;
@@ -7744,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;

         const uint8_t * restrict q5 = x[i].qs;
@@ -8312,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     for (int i = 0; i < nb; ++i) {

-        const float d_all = (
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);

         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -8483,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     for (int i = 0; i < nb; ++i) {

-        const float d_all = (
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);

         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -8972,7 +9193,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 #endif
 }

-// TODO
 void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
@@ -9107,6 +9327,271 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 #endif
 }

+#ifdef __AVX2__
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(ax, sy);
+}
+#endif
+
+void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_s * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined __ARM_NEON
+
+    const uint8x16_t m8 = vdupq_n_u8(0x08);
+    const uint8x16_t m7 = vdupq_n_u8(0x07);
+    const uint8x16_t m1 = vdupq_n_u8(0x01);
+    const int32x4_t vzero = vdupq_n_s32(0);
+
+    uint16_t gindex[8];
+    uint16x8x2_t vindex;
+    int8x16x4_t q1b;
+    ggml_int8x16x4_t q8b;
+    uint16x8x4_t scales;
+    int32x4x2_t sumi;
+    int32x4x2_t dotq;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        sumi.val[0] = sumi.val[1] = vzero;
+
+        for (int i128 = 0; i128 < QK_K/128; ++i128) {
+            const uint8x16_t ql = vld1q_u8(qs); qs += 16;
+            const uint8x8_t tm1 = vld1_u8 (sc); sc += 8;
+            const uint8x8_t tm2 = vshr_n_u8(tm1, 4);
+            const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
+            const uint8x16_t hbit = vandq_u8(qh, m8);
+            vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
+            vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
+            const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
+            scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
+            scales.val[1] = vmovl_u8(vget_high_u8 (scales8));
+
+            for (int l = 0; l < 2; ++l) {
+                vst1q_u16(gindex+0, vindex.val[l]);
+                q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1])));
+                q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3])));
+                q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5])));
+                q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7])));
+                q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+                dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1]));
+                dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3]));
+
+                sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
+                sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
+            }
+        }
+
+        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1]));
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m128i m8 = _mm_set1_epi8(0x08);
+    const __m128i m7 = _mm_set1_epi8(0x07);
+    const __m128i m1 = _mm_set1_epi8(0x01);
+    const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
+    const __m128i shuffle_s[4] = {
+        _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
+        _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
+        _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
+        _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
+    };
+
+    uint64_t aux64;
+
+    __m256i v_gindex;
+    const uint16_t * gindex = (const uint16_t *)&v_gindex;
+
+    __m256 accum = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        __m256i sumi = _mm256_setzero_si256();
+        for (int i128 = 0; i128 < QK_K/128; ++i128) {
+            const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
+            memcpy(&aux64, sc, 8); sc += 8;
+            const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
+            const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
+            v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
+            const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
+
+            for (int i32 = 0; i32 < 4; ++i32) {
+                const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+                const __m256i q1b = _mm256_set_epi64x(iq1s_grid[gindex[4*i32+3]], iq1s_grid[gindex[4*i32+2]],
+                                                      iq1s_grid[gindex[4*i32+1]], iq1s_grid[gindex[4*i32+0]]);
+                const __m256i dot = mul_add_epi8(q1b, q8b);
+                const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
+                const __m256i p = _mm256_madd_epi16(s16, dot);
+                sumi = _mm256_add_epi32(sumi, p);
+            }
+
+        }
+
+        accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum);
+
+    }
+
+    *s = hsum_float_8(accum);
+
+#else
+
+    int db[4];
+    uint16_t idx[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        int sumi = 0;
+        for (int i32 = 0; i32 < QK_K/32; ++i32) {
+            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
+            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
+            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
+            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
+            db[0] = (2*(sc[0] & 7) + 1);
+            db[1] = (2*((sc[0] >> 4) & 7) + 1);
+            db[2] = (2*(sc[1] & 7) + 1);
+            db[3] = (2*((sc[1] >> 4) & 7) + 1);
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                int suml = 0;
+                for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j];
+                sumi += db[l] * suml;
+                q8 += 8;
+            }
+            qs += 4;
+            sc += 2;
+        }
+
+        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
+    }
+
+    *s = sumf;
+
+#endif
+}
+
+void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK4_NL == 0);
+    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
+
+    const block_iq4_nl * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+    const int nb = n / QK4_NL;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    uint8x16x2_t q4bits;
+    int8x16x4_t q4b;
+    int8x16x4_t q8b;
+    int32x4_t prod_1, prod_2;
+
+    float sumf = 0;
+
+    for (int ib = 0; ib < nb; ib += 2) {
+        q4bits.val[0] = vld1q_u8(x[ib+0].qs);
+        q4bits.val[1] = vld1q_u8(x[ib+1].qs);
+        q8b.val[0] = vld1q_s8(y[ib+0].qs);
+        q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
+        q8b.val[2] = vld1q_s8(y[ib+1].qs);
+        q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);
+
+        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
+        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
+        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+        sumf +=
+            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
+            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b = _mm_set1_epi8(0x0f);
+    const __m256i mone = _mm256_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (int ib = 0; ib < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
+        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
+        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
+        const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+        const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
+        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
+        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
+                                 _mm256_cvtepi32_ps(p_1), accum1);
+        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
+                                 _mm256_cvtepi32_ps(p_2), accum2);
+
+        y += 2;
+        x += 2;
+    }
+
+    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
+
+#else
+    float sumf = 0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+        int sumi1 = 0, sumi2 = 0;
+        for (int j = 0; j < QK4_NL/2; ++j) {
+            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+#endif
+}
+
 // ================================ IQ2 quantization =============================================

 typedef struct {
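`mul_add_epi8` fuses a signed-byte multiply with a pairwise add: `_mm256_sign_epi8` transfers the sign of `x` onto `y` so that the unsigned-by-signed `_mm256_maddubs_epi16` effectively computes `x*y`, summing adjacent products into 16-bit lanes. A scalar model, for illustration only (it ignores the saturation `_mm256_maddubs_epi16` applies on overflow, which the small grid values used here should stay under):

```c
#include <stdint.h>

// Hypothetical scalar model of mul_add_epi8 (not the AVX2 code):
// out[k] = x[2k]*y[2k] + x[2k+1]*y[2k+1] for signed 8-bit inputs.
static void mul_add_epi8_model(const int8_t x[32], const int8_t y[32], int16_t out[16]) {
    for (int k = 0; k < 16; ++k) {
        out[k] = (int16_t)(x[2*k] * y[2*k] + x[2*k + 1] * y[2*k + 1]);
    }
}
```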
@@ -9115,14 +9600,22 @@ typedef struct {
     uint16_t * neighbours;
 } iq2_entry_t;

-static iq2_entry_t iq2_data[
+static iq2_entry_t iq2_data[3] = {
+    {NULL, NULL, NULL},
     {NULL, NULL, NULL},
     {NULL, NULL, NULL},
 };

-static inline int iq2_data_index(
-    GGML_ASSERT(
-    return
+static inline int iq2_data_index(enum ggml_type type) {
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
+    return type == GGML_TYPE_IQ2_XXS ? 0 :
+           type == GGML_TYPE_IQ2_XS  ? 1 : 2;
+}
+
+static inline int iq2_grid_size(enum ggml_type type) {
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
+    return type == GGML_TYPE_IQ2_XXS ? 256 :
+           type == GGML_TYPE_IQ2_XS  ? 512 : 512;
 }

 static int iq2_compare_func(const void * left, const void * right) {
@@ -9131,12 +9624,13 @@ static int iq2_compare_func(const void * left, const void * right) {
     return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
 }

-void iq2xs_init_impl(
-    const int gindex = iq2_data_index(
+void iq2xs_init_impl(enum ggml_type type) {
+    const int gindex = iq2_data_index(type);
+    const int grid_size = iq2_grid_size(type);
     if (iq2_data[gindex].grid) {
         return;
     }
-    static const uint16_t
+    static const uint16_t kgrid_2bit_256[256] = {
         0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
         100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
         1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
@@ -9154,7 +9648,7 @@ void iq2xs_init_impl(int grid_size) {
         33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
         37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
     };
-    static const uint16_t
+    static const uint16_t kgrid_2bit_512[512] = {
         0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
         73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
         260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
@@ -9188,9 +9682,45 @@ void iq2xs_init_impl(int grid_size) {
         40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
         42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
     };
+    static const uint16_t kgrid_1bit_512[512] = {
+        10, 33, 41, 85, 132, 134, 160, 162, 277, 337, 340, 345, 357, 405, 516, 545,
+        553, 598, 641, 650, 681, 1042, 1044, 1097, 1169, 1176, 1320, 1345, 1365, 1378, 1434, 1444,
+        1545, 1617, 1642, 1685, 2053, 2080, 2089, 2133, 2176, 2182, 2208, 2214, 2306, 2384, 2393, 2440,
+        2453, 2581, 2664, 2690, 2721, 4117, 4161, 4182, 4184, 4261, 4357, 4369, 4372, 4377, 4390, 4422,
+        4432, 4437, 4449, 4457, 4485, 4497, 4505, 4629, 4677, 4696, 4774, 5205, 5217, 5225, 5386, 5397,
+        5409, 5445, 5457, 5460, 5461, 5462, 5465, 5472, 5477, 5525, 5545, 5650, 5668, 5717, 5729, 5769,
+        5777, 6212, 6234, 6244, 6293, 6424, 6482, 6485, 6502, 6505, 6529, 6538, 6565, 6656, 6682, 6788,
+        6806, 6820, 8218, 8224, 8226, 8232, 8277, 8326, 8354, 8469, 8521, 8530, 8549, 8596, 8737, 8794,
+        9221, 9253, 9348, 9369, 9380, 9474, 9557, 9633, 9732, 9753, 9793, 9830, 9862, 9880, 10240, 10272,
+        10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665,
+        16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685,
+        17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529,
+        18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517,
+        20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872,
+        20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653,
+        21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842,
+        21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913,
+        21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608,
+        22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072,
+        23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110,
+        25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937,
+        25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885,
+        26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808,
+        32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320,
+        33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918,
+        34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125,
+        37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973,
+        38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485,
+        38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497,
+        39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514,
+        41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
+        42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
+    };
+
     const int kmap_size = 43692;
-    const int nwant = 2;
-    const uint16_t * kgrid =
+    const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
+    const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
+                             type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 : kgrid_1bit_512;
     uint64_t * kgrid_q2xs;
     int * kmap_q2xs;
     uint16_t * kneighbors_q2xs;
@@ -9286,9 +9816,9 @@ void iq2xs_init_impl(int grid_size) {
     free(dist2);
 }

-void iq2xs_free_impl(
-    GGML_ASSERT(
-    const int gindex = iq2_data_index(
+void iq2xs_free_impl(enum ggml_type type) {
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
+    const int gindex = iq2_data_index(type);
     if (iq2_data[gindex].grid) {
         free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
         free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
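The init/free entry points now take the quant type instead of a raw grid size, so IQ2_XS and IQ1_S can share the 512-entry machinery while using different grid tables. A hypothetical usage fragment (callers normally go through `ggml_quantize_init()`/`ggml_quantize_free()`, per the asserts in `quantize_row_iq1_s_impl` further down):

```c
// Hypothetical fragment: build the IQ1_S lookup tables once before
// quantizing rows, and release them when done.
iq2xs_init_impl(GGML_TYPE_IQ1_S);  // selects kgrid_1bit_512 and nwant = 3
/* ... quantize IQ1_S rows ... */
iq2xs_free_impl(GGML_TYPE_IQ1_S);
```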
@@ -9322,7 +9852,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u

 static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {

-    const int gindex = iq2_data_index(
+    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);

     const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
     const int * kmap_q2xs = iq2_data[gindex].map;
@@ -9495,7 +10025,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict

 static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {

-    const int gindex = iq2_data_index(
+    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);

     const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
     const int * kmap_q2xs = iq2_data[gindex].map;
@@ -10132,3 +10662,327 @@ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * re
     assert(k % QK_K == 0);
     quantize_row_iq3_xxs_impl(x, y, k, NULL);
 }
+
+// =================================== 1.5 bpw ===================================================
+
+static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
+        const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
+    int num_neighbors = neighbours[0];
+    GGML_ASSERT(num_neighbors > 0);
+    float best_score = 0;
+    int grid_index = -1;
+    for (int j = 1; j <= num_neighbors; ++j) {
+        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+        float sumqx = 0, sumq2 = 0;
+        for (int i = 0; i < 8; ++i) {
+            float q = (pg[i] - 3)/2;
+            float w = weight[i];
+            sumqx += w*q*xval[i];
+            sumq2 += w*q*q;
+        }
+        if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+            *scale = sumqx/sumq2; best_score = *scale * sumqx;
+            grid_index = neighbours[j];
+        }
+    }
+    if (grid_index < 0) {
+        for (int i = 0; i < ngrid; ++i) {
+            const int8_t * grid_i = (const int8_t *)(grid + i);
+            float sumqx = 0, sumq2 = 0;
+            for (int j = 0; j < 8; ++j) {
+                float w = weight[j];
+                float q = (grid_i[j] - 3)/2;
+                sumqx += w*q*xval[j];
+                sumq2 += w*q*q;
+            }
+            if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+                *scale = sumqx/sumq2; best_score = *scale*sumqx;
+                grid_index = i;
+            }
+        }
+    }
+    if (grid_index < 0) {
+        printf("Oops, did not find grid point\n");
+        printf("Have %d neighbours\n", num_neighbors);
+        for (int j = 1; j <= num_neighbors; ++j) {
+            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+            float sumqx = 0, sumq2 = 0;
+            for (int i = 0; i < 8; ++i) {
+                float q = (pg[i] - 3)/2;
+                float w = weight[i];
+                sumqx += w*q*xval[i];
+                sumq2 += w*q*q;
+            }
+            printf("  neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
+        }
+    }
+    GGML_ASSERT(grid_index >= 0);
+    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    *scale *= 1.05f;  // This is a fudge factor. Don't ask me why it improves the result.
+    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    const int8_t * pg = (const int8_t *)(grid + grid_index);
+    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
+    return grid_index;
+}
+
+static int iq1_sort_helper(const void * left, const void * right) {
+    const float * l = left;
+    const float * r = right;
+    return *l < *r ? -1 : *l > *r ? 1 : 0;
+}
+
+static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+
+    const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
+
+    const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
+    const int * kmap_q2xs = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    GGML_ASSERT(quant_weights && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int nbl = n/256;
+
+    block_iq1_s * y = vy;
+
+    float scales[QK_K/8];
+    float weight[8];
+    int8_t L[8];
+    float sumx[9];
+    float sumw[9];
+    float pairs[16];
+    int * idx = (int *)(pairs + 1);
+    uint8_t hbit[QK_K/8];
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        memset(y[ibl].qs, 0, QK_K/8);
+        memset(y[ibl].scales, 0, QK_K/16);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/8; ++ib) {
+            const float * xb = xbl + 8*ib;
+            const float * qw = quant_weights + QK_K*ibl + 8*ib;
+            for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            float max = fabsf(xb[0]);
+            for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i]));
+            if (!max) {
+                scales[ib] = 0;
+                memset(L, 1, 8);
+                continue;
+            }
+            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+            // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+            // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+            // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+            // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
+            // and score for each possible split.
+            for (int j = 0; j < 8; ++j) {
+                pairs[2*j] = xb[j];
+                idx[2*j] = j;
+            }
+            qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper);
+            {
+                sumx[0] = sumw[0] = 0;
+                for (int j = 0; j < 8; ++j) {
+                    int i = idx[2*j];
+                    sumx[j+1] = sumx[j] + weight[i]*xb[i];
+                    sumw[j+1] = sumw[j] + weight[i];
+                }
+            }
+            float best_score = 0, scale = max;
+            int besti1 = 0, besti2 = 0;
+            for (int i1 = 0; i1 <= 8; ++i1) {
+                for (int i2 = i1; i2 <= 8; ++i2) {
+                    float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]);
+                    float sumq2 = (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]);
+                    if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+                        scale = sumqx/sumq2; best_score = scale*sumqx;
+                        besti1 = i1; besti2 = i2;
+                    }
+                }
+            }
+            for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+            for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+            for (int j = besti2; j < 8; ++j) L[idx[2*j]] = 2;
+            if (scale < 0) {
+                for (int j = 0; j < 8; ++j) L[j] = 2 - L[j];
+                scale = -scale;
+            }
+            // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring
+            // grid point that minimizes SSD.
+            uint16_t u = 0;
+            for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
+            int grid_index = kmap_q2xs[u];
+            if (grid_index < 0) {
+                const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
+                GGML_ASSERT(grid_index >= 0);
+            }
+            y[ibl].qs[ib] = grid_index & 255;
+            hbit[ib] = grid_index >> 8;
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            memset(y[ibl].qs, 0, QK_K/8);
+            continue;
+        }
+
+        float d = max_scale/15;
+        y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/8; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(7, l));
+            if (hbit[ib]) l |= 8;
+            y[ibl].scales[ib/2] |= (l << 4*(ib%2));
+        }
+    }
+}
+
+size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq1_s);
+    }
+    return nrow * nblock * sizeof(block_iq1_s);
+}
+
+// ============================ 4-bit non-linear quants
+
+static inline int best_index_int8(int n, const int8_t * val, float x) {
+    if (x <= val[0]) return 0;
+    if (x >= val[n-1]) return n-1;
+    int ml = 0, mu = n-1;
+    while (mu-ml > 1) {
+        int mav = (ml+mu)/2;
+        if (x < val[mav]) mu = mav; else ml = mav;
+    }
+    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
+}
+
+static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x,
+        ggml_fp16_t * dh, uint8_t * q4,
+        float * weight, uint8_t * L,
+        const int8_t * values,
+        const float * quant_weights) {
+
+    const int ntry = 7;
+
+    float sigma2 = 0;
+    for (int j = 0; j < QK4_NL; ++j) sigma2 += x[j]*x[j];
+    sigma2 *= 2.f/QK4_NL;
+
+    const int nb = QK4_NL/block_size;
+
+    memset(q4, 0, QK4_NL/2);
+    for (int ib = 0; ib < nb; ++ib) {
+        dh[ib] = GGML_FP32_TO_FP16(0.f);
+        const float * xb = x + ib*block_size;
+        if (quant_weights) {
+            const float * qw = quant_weights + ib*block_size;
+            for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        } else {
+            for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
+        }
+        float amax = 0, max = 0;
+        for (int j = 0; j < block_size; ++j) {
+            float ax = fabsf(xb[j]);
+            if (ax > amax) {
+                amax = ax; max = xb[j];
+            }
+        }
+        if (!amax) {
+            continue;
+        }
+        float d = -max/values[0];
+        float id = 1/d;
+        float sumqx = 0, sumq2 = 0;
+        for (int j = 0; j < block_size; ++j) {
+            float al = id*xb[j];
+            int l = best_index_int8(16, values, al);
+            float q = values[l];
+            float w = weight[j];
+            sumqx += w*q*xb[j];
+            sumq2 += w*q*q;
+        }
+        float best_id = id;
+        d = sumqx/sumq2;
+        float best = d*sumqx;
+        for (int itry = -ntry; itry <= ntry; ++itry) {
+            id = (itry + values[0])/max;
+            sumqx = sumq2 = 0;
+            for (int j = 0; j < block_size; ++j) {
+                float al = id*xb[j];
+                int l = best_index_int8(16, values, al);
+                float q = values[l];
+                float w = weight[j];
+                sumqx += w*q*xb[j];
+                sumq2 += w*q*q;
+            }
+            if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                d = sumqx/sumq2; best = d * sumqx;
+                best_id = id;
+            }
+        }
+        dh[ib] = GGML_FP32_TO_FP16(d);
+        for (int j = 0; j < block_size; ++j) {
+            L[ib*block_size + j] = best_index_int8(16, values, best_id*xb[j]);
+        }
+    }
+    for (int i = 0; i < QK4_NL/32; ++i) {
+        for (int j = 0; j < 16; ++j) {
+            q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
+        }
+    }
+}
+
+size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    GGML_ASSERT(n_per_row%QK4_NL == 0);
+    int nblock = n_per_row/QK4_NL;
+    char * qrow = (char *)dst;
+    uint8_t L[QK4_NL];
+    float weight[32];
+    for (int row = 0; row < nrow; ++row) {
+        block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
+        for (int ibl = 0; ibl < nblock; ++ibl) {
+            const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
+            quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, weight, L, kvalues_iq4nl, qw);
+        }
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq4_nl);
+    }
+    return nrow * nblock * sizeof(block_iq4_nl);
+}
+
+void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
+    assert(k % QK4_NL == 0);
+    block_iq4_nl * restrict y = vy;
+    quantize_row_iq4_nl_reference(x, y, k);
+}
+
+void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
+    assert(k % QK4_NL == 0);
+    quantize_iq4_nl(x, y, 1, k, NULL, NULL);
+}
+