llama_cpp 0.13.0 → 0.14.0
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +59 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -4
- data/vendor/tmp/llama.cpp/Makefile +2 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +4 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -21
- data/vendor/tmp/llama.cpp/ggml-backend.h +16 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +63 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +120 -75
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +178 -133
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3432 -1118
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1327 -773
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +227 -15
- data/vendor/tmp/llama.cpp/ggml.h +30 -4
- data/vendor/tmp/llama.cpp/llama.cpp +631 -211
- data/vendor/tmp/llama.cpp/llama.h +28 -10
- metadata +2 -2
@@ -51,6 +51,7 @@
 
 #define UNUSED GGML_UNUSED
 
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

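The `MM256_SET_M128I` macro above stands in for `_mm256_set_m128i` on compilers that lack it. As a minimal sketch (an editor's illustration, not part of the packaged source), it packs two 128-bit vectors into one 256-bit vector, with the second argument landing in the low lanes:

```c
// Sketch: what MM256_SET_M128I does. Compile with e.g. `gcc -mavx`.
#include <immintrin.h>
#include <stdio.h>

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const __m128i lo = _mm_set1_epi32(1);        // second argument -> lanes 0..3
    const __m128i hi = _mm_set1_epi32(2);        // first argument  -> lanes 4..7
    const __m256i v  = MM256_SET_M128I(hi, lo);  // note the (high, low) order
    int out[8];
    _mm256_storeu_si256((__m256i *)out, v);
    for (int i = 0; i < 8; ++i) printf("%d ", out[i]);  // prints: 1 1 1 1 2 2 2 2
    return 0;
}
```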
@@ -463,8 +464,8 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
 }
 
 // NOTE: not tested
-inline static …
-…
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
 
     res[ 0] = a[b[ 0]];
     res[ 1] = a[b[ 1]];

@@ -3818,71 +3819,71 @@ static const uint32_t iq3xxs_grid[256] = {
     0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };
 
-static const uint32_t …
-… (64 removed lines not captured in this rendering)
+static const uint32_t iq3s_grid[512] = {
+    0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
+    0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
+    0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
+    0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
+    0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
+    0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
+    0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
+    0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
+    0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
+    0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
+    0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
+    0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
+    0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
+    0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
+    0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
+    0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
+    0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
+    0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
+    0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
+    0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
+    0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
+    0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
+    0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
+    0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
+    0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
+    0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
+    0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
+    0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
+    0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
+    0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
+    0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
+    0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
+    0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
+    0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
+    0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
+    0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
+    0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
+    0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
+    0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
+    0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
+    0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
+    0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
+    0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
+    0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
+    0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
+    0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
+    0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
+    0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
+    0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
+    0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
+    0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
+    0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
+    0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
+    0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
+    0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
+    0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
+    0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
+    0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
+    0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
+    0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
+    0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
+    0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
+    0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
+    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 };
 
 #define NGRID_IQ2XXS 512

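Each entry of the new 512-entry `iq3s_grid` packs four byte values that the dequantization code below reads through a `uint8_t` pointer. A minimal sketch of that access pattern (an editor's illustration, not from the diff; assumes a little-endian target):

```c
// Sketch: an iq3s_grid entry viewed as four bytes, the way the kernels do
// with `const uint8_t * grid = (const uint8_t *)(iq3s_grid + idx);`.
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint32_t entry = 0x01010103;               // second value of the first data row
    const uint8_t *g = (const uint8_t *)&entry;
    printf("%u %u %u %u\n", g[0], g[1], g[2], g[3]); // 3 1 1 1 on little-endian
    return 0;
}
```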
@@ -4162,11 +4163,11 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
         const uint8_t * signs = x[i].signs;
 
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const float db1 = d * (…
-            const float db2 = d * (…
+            const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
+            const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
             for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(…
-                const uint8_t * grid2 = (const uint8_t *)(…
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
                 for (int j = 0; j < 4; ++j) {
                     y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
                     y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);

@@ -4176,8 +4177,8 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
             qs += 8;
             signs += 4;
             for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(…
-                const uint8_t * grid2 = (const uint8_t *)(…
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
                 for (int j = 0; j < 4; ++j) {
                     y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
                     y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);

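The rewritten `dequantize_row_iq3_s` lines show IQ3_S moving to 9-bit grid indices (hence the doubled 512-entry grid) and to scales decoded as `1 + 2*s`, i.e. the odd multipliers 1, 3, ..., 31 from a 4-bit nibble. A scalar sketch of the index construction (the helper name `iq3s_index_pair` is the editor's, not from the source):

```c
#include <stdint.h>

// For pair l of a 32-value group, qs holds the low 8 bits of two grid
// indices and qh contributes bit 8 of each: bit 2*l of qh for the first
// index and bit 2*l+1 for the second, which is exactly what the
// (8-2*l) and (7-2*l) shifts in the diff select.
static inline void iq3s_index_pair(const uint8_t *qs, uint8_t qh, int l,
                                   uint16_t *i1, uint16_t *i2) {
    *i1 = (uint16_t)(qs[2*l+0] | ((qh << (8-2*l)) & 256));
    *i2 = (uint16_t)(qs[2*l+1] | ((qh << (7-2*l)) & 256));
}
```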
@@ -9563,7 +9564,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
         const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
         const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
-        const __m256i full_signs = …
+        const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
 
         const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
         const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));

@@ -9585,8 +9586,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
         const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
         const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
 
-        const __m256i sc1 = …
-        const __m256i sc2 = …
+        const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
+        const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
 
         const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
 

@@ -9653,8 +9654,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
         const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
         const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
-        const __m256i full_signs_1 = …
-        const __m256i full_signs_2 = …
+        const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
+        const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
 
         __m256i signs;
         signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);

@@ -10089,18 +10090,34 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
 
 #if defined(__ARM_NEON)
 
+    typedef union {
+        uint16x8_t vec_index;
+        uint16_t   index[8];
+    } vec_index_t;
+
     static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                         0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
     };
 
     static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
 
-    const …
-…
+    static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
+
+    const uint8x16x2_t mask1  = vld1q_u8_x2(k_mask1);
+    const uint8x16_t   mask2  = vld1q_u8(k_mask2);
+    const int16x8_t    hshift = vld1q_s16(k_shift);
+    const uint16x8_t   m256   = vdupq_n_u16(256);
+    const uint8x16_t   m1     = vdupq_n_u8(1);
 
     uint8x16x2_t vs;
     ggml_int8x16x4_t q3s;
     ggml_int8x16x4_t q8b;
+    vec_index_t idx;
+
+#if QK_K == 256
+    uint32_t scales32[2];
+    const uint8_t * scales8 = (const uint8_t *)scales32;
+#endif
 
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {

@@ -10109,47 +10126,63 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
         const uint8_t  * restrict qh = x[i].qh;
         const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
         const int8_t   * restrict q8 = y[i].qs;
+
+#if QK_K == 256
+        memcpy(scales32, x[i].scales, 4);
+        scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
+        scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
+#endif
+
         int sumi1 = 0, sumi2 = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-… (9 removed lines not captured in this rendering)
+
+            const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
+            idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
+            const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+                                          iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
+            const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+                                          iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
+            idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
+            const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+                                          iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
+            const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+                                          iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
+
 
             vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
             vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
             vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vceqq_u8(vs.val[0], mask2);
-            vs.val[1] = vceqq_u8(vs.val[1], mask2);
+            vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
+            vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
 
-            q3s.val[0] = …
-            q3s.val[1] = …
+            q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
+            q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
 
             vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
             vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
             vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vceqq_u8(vs.val[0], mask2);
-            vs.val[1] = vceqq_u8(vs.val[1], mask2);
+            vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
+            vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
 
             signs += 4;
 
-            q3s.val[2] = …
-            q3s.val[3] = …
+            q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
+            q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
 
             const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
             const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
+#if QK_K == 256
+            sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
+            sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
+#else
             sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
             sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
+#endif
         }
         sumf += d*(sumi1 + sumi2);
     }
-    *s = …
+    *s = sumf;
 
 #elif defined(__AVX2__)
 

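One subtle change in the NEON path: the sign masks are now OR-ed with 1 (the new `m1` vector) so they act as signed ±1 multipliers for `vmulq_s8` instead of pure select masks. A scalar sketch of the per-byte effect (an editor's illustration, not from the source):

```c
#include <stdint.h>

// vceqq_u8(v & mask, mask) yields 0xFF where the sign bit was set and 0x00
// where it was not; OR-ing with 0x01 maps those to 0xFF and 0x01, which
// reinterpret as the int8 multipliers -1 and +1.
static inline int8_t sign_multiplier(uint8_t masked, uint8_t mask_bit) {
    uint8_t eq = (masked == mask_bit) ? 0xFF : 0x00;
    return (int8_t)(eq | 0x01);  // -1 if the sign bit is set, +1 otherwise
}
```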
@@ -10164,6 +10197,16 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
     const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
     const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
 
+    const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    const __m256i idx_mask  = _mm256_set1_epi32(256);
+
+    typedef union {
+        __m256i  vec[2];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;

@@ -10176,24 +10219,25 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
             const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i …
-… (8 removed lines not captured in this rendering)
-            const __m256i …
-… (8 removed lines not captured in this rendering)
+            const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
+            idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
+            idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
+            idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
+            idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
+            idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
+            idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
+
+            // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
+            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
+            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
+            const __m256i q2_1 = _mm256_set_epi32(
+                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
+                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
+            );
+            const __m256i q2_2 = _mm256_set_epi32(
+                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
+                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
+            );
 
             __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
             aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);

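The AVX2 path builds all sixteen 9-bit indices at once: `qh[ib32]` is broadcast, each 32-bit lane is shifted by a different amount via `_mm256_sllv_epi32` (note `_mm256_set_epi32` takes arguments high-to-low, so lane j gets shift 8-j), and bit 8 is isolated with the `idx_mask` of 256. A scalar equivalent of what lands in each lane (an editor's sketch, not from the source):

```c
#include <stdint.h>

// Lane j receives bit j of qh promoted to bit position 8, ready to be
// OR-ed with the zero-extended low byte of the index from qs.
static inline void expand_qh(uint8_t qh, uint32_t ninth_bit[8]) {
    for (int j = 0; j < 8; ++j) {
        ninth_bit[j] = ((uint32_t)qh << (8 - j)) & 256;
    }
}
```

The commented-out gather lines preserve the upstream author's observation that `_mm256_i32gather_epi32` measured slower on their Ryzen 7950X than eight table loads assembled with `_mm256_set_epi32`.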
@@ -10221,7 +10265,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
 
     }
 
-    *s = …
+    *s = hsum_float_8(accumf);
 
 #else
 

@@ -10238,8 +10282,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
         const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
         int32_t sumi = 0;
         for (int l = 0; l < 4; ++l) {
-            const uint8_t * grid1 = (const uint8_t *)(…
-            const uint8_t * grid2 = (const uint8_t *)(…
+            const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
+            const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
             for (int j = 0; j < 4; ++j) {
                 sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                 sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);

@@ -10251,8 +10295,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
         bsum += sumi * ls1;
         sumi = 0;
         for (int l = 0; l < 4; ++l) {
-            const uint8_t * grid1 = (const uint8_t *)(…
-            const uint8_t * grid2 = (const uint8_t *)(…
+            const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
+            const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
             for (int j = 0; j < 4; ++j) {
                 sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                 sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);

@@ -10265,7 +10309,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
         }
         sumf += d * bsum;
     }
-    *s = …
+    *s = sumf;
 #endif
 }
 

@@ -10508,10 +10552,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
     const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
     const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
-    const __m256i q4b_1 = …
-…
-    const __m256i q4b_2 = …
-…
+    const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                          _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+    const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                          _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
     const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
     const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
     const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);

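In the IQ4_NL/IQ4_XS kernels, `_mm_shuffle_epi8` with `values128` acts as a 16-entry lookup table: each 4-bit quant code selects one signed byte from the codebook. A scalar sketch of the same lookup (an editor's illustration; names are hypothetical, not from the source):

```c
#include <stdint.h>

// Scalar equivalent of the q4b_1/q4b_2 construction: split each packed
// byte into its two nibbles and map each through a 16-entry codebook,
// which is what _mm_shuffle_epi8(values128, nibbles) does 16 lanes at a time.
static void dequant_nibbles(const int8_t codebook[16], const uint8_t *packed,
                            int8_t *lo, int8_t *hi, int nbytes) {
    for (int i = 0; i < nbytes; ++i) {
        lo[i] = codebook[packed[i] & 0x0f];  // low nibbles  (q4bits & m4b)
        hi[i] = codebook[packed[i] >> 4];    // high nibbles (srli 4, then & m4b)
    }
}
```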
@@ -10618,10 +10662,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
             const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
             const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
             const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q4b_1 = …
-…
-            const __m256i q4b_2 = …
-…
+            const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+            const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
             const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
             const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
             const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;

@@ -11912,7 +11956,8 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
         }
         float best = 0;
         float scale = max/(2*kMaxQ-1);
-        for (int …
+        for (int k = 0; k < bs4; ++k) is_on_grid[k] = false;
+        for (int is = -9; is <= 9; ++is) {
             float id = (2*kMaxQ-1+is*0.2f)/max;
             float this_scale = 1/id;
             for (int k = 0; k < bs4; ++k) {

@@ -11948,7 +11993,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
         if (n_not_ongrid > 0 && scale > 0) {
             float id = 1/scale;
             for (int k = 0; k < bs4; ++k) {
-                if (is_on_grid[k]) continue;
+                //if (is_on_grid[k]) continue;
                 uint16_t u = 0;
                 for (int i = 0; i < 4; ++i) {
                     int l = nearest_int(0.5f*(id*xval[4*k+i]-1));

@@ -12004,7 +12049,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
     }
 
     float d = max_scale/31;
-    y[ibl].d = GGML_FP32_TO_FP16(d);
+    y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f);
     float id = 1/d;
     for (int ib = 0; ib < QK_K/block_size; ib += 2) {
         int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));

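The quantizer changes close the loop with the new decoder: codes are stored so that a level l reproduces the odd value 2*l+1, and the super-block scale is nudged up by a factor of 1.033 before the FP16 round-trip. A sketch of the encode/decode pair implied by the `nearest_int` lines (an editor's illustration; function names are hypothetical):

```c
#include <math.h>

// Encode: invert x ~= scale * (2*l + 1)  =>  l = round(0.5 * (x/scale - 1)).
static inline int iq3s_encode(float x, float id /* = 1/scale */) {
    return (int)lroundf(0.5f*(id*x - 1));
}

// Decode: mirrors the dequantizer's db = d * (1 + 2*s) style expressions.
static inline float iq3s_decode(float scale, int l) {
    return scale * (float)(2*l + 1);
}
```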