llama_cpp 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -1,6 +1,12 @@
|
|
1
|
+
#define GGML_COMMON_IMPL_C
|
2
|
+
#include "ggml-common.h"
|
3
|
+
|
1
4
|
#include "ggml-quants.h"
|
2
5
|
#include "ggml-impl.h"
|
3
6
|
|
7
|
+
#define GGML_COMMON_IMPL_C
|
8
|
+
#include "ggml-common.h"
|
9
|
+
|
4
10
|
#include <math.h>
|
5
11
|
#include <string.h>
|
6
12
|
#include <assert.h>
|
@@ -51,6 +57,7 @@
|
|
51
57
|
|
52
58
|
#define UNUSED GGML_UNUSED
|
53
59
|
|
60
|
+
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
54
61
|
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
55
62
|
|
56
63
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
@@ -463,8 +470,8 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|
463
470
|
}
|
464
471
|
|
465
472
|
// NOTE: not tested
|
466
|
-
inline static
|
467
|
-
|
473
|
+
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
474
|
+
uint8x16_t res;
|
468
475
|
|
469
476
|
res[ 0] = a[b[ 0]];
|
470
477
|
res[ 1] = a[b[ 1]];
|
@@ -947,7 +954,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
|
|
947
954
|
const float d = amax / ((1 << 7) - 1);
|
948
955
|
const float id = d ? 1.0f/d : 0.0f;
|
949
956
|
|
950
|
-
y[i].d = d;
|
957
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
951
958
|
|
952
959
|
int sum = 0;
|
953
960
|
|
@@ -962,7 +969,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
|
|
962
969
|
sum += y[i].qs[QK8_1/2 + j];
|
963
970
|
}
|
964
971
|
|
965
|
-
y[i].s = sum*d;
|
972
|
+
y[i].s = GGML_FP32_TO_FP16(sum*d);
|
966
973
|
}
|
967
974
|
}
|
968
975
|
|
@@ -990,7 +997,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
990
997
|
const float d = amax / ((1 << 7) - 1);
|
991
998
|
const float id = d ? 1.0f/d : 0.0f;
|
992
999
|
|
993
|
-
y[i].d = d;
|
1000
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
994
1001
|
|
995
1002
|
int32x4_t accv = vdupq_n_s32(0);
|
996
1003
|
|
@@ -1006,7 +1013,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1006
1013
|
accv = vaddq_s32(accv, vi);
|
1007
1014
|
}
|
1008
1015
|
|
1009
|
-
y[i].s = d * vaddvq_s32(accv);
|
1016
|
+
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
|
1010
1017
|
}
|
1011
1018
|
#elif defined(__wasm_simd128__)
|
1012
1019
|
for (int i = 0; i < nb; i++) {
|
@@ -1029,7 +1036,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1029
1036
|
const float d = amax / ((1 << 7) - 1);
|
1030
1037
|
const float id = d ? 1.0f/d : 0.0f;
|
1031
1038
|
|
1032
|
-
y[i].d = d;
|
1039
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1033
1040
|
|
1034
1041
|
v128_t accv = wasm_i32x4_splat(0);
|
1035
1042
|
|
@@ -1045,10 +1052,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1045
1052
|
accv = wasm_i32x4_add(accv, vi);
|
1046
1053
|
}
|
1047
1054
|
|
1048
|
-
y[i].s =
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1055
|
+
y[i].s = GGML_FP32_TO_FP16(
|
1056
|
+
d * (wasm_i32x4_extract_lane(accv, 0) +
|
1057
|
+
wasm_i32x4_extract_lane(accv, 1) +
|
1058
|
+
wasm_i32x4_extract_lane(accv, 2) +
|
1059
|
+
wasm_i32x4_extract_lane(accv, 3)));
|
1052
1060
|
}
|
1053
1061
|
#elif defined(__AVX2__) || defined(__AVX__)
|
1054
1062
|
for (int i = 0; i < nb; i++) {
|
@@ -1073,7 +1081,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1073
1081
|
|
1074
1082
|
// Quantize these floats
|
1075
1083
|
const float d = maxScalar / 127.f;
|
1076
|
-
y[i].d = d;
|
1084
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1077
1085
|
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
1078
1086
|
const __m256 mul = _mm256_set1_ps( id );
|
1079
1087
|
|
@@ -1097,7 +1105,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1097
1105
|
|
1098
1106
|
#if defined(__AVX2__)
|
1099
1107
|
// Compute the sum of the quants and set y[i].s
|
1100
|
-
y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
|
1108
|
+
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
|
1101
1109
|
|
1102
1110
|
// Convert int32 to int16
|
1103
1111
|
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
@@ -1127,7 +1135,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1127
1135
|
// Compute the sum of the quants and set y[i].s
|
1128
1136
|
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
|
1129
1137
|
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
|
1130
|
-
y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
|
1138
|
+
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
|
1131
1139
|
|
1132
1140
|
// Convert int32 to int16
|
1133
1141
|
ni0 = _mm_packs_epi32( ni0, ni1 );
|
@@ -1158,7 +1166,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1158
1166
|
const float d = amax / ((1 << 7) - 1);
|
1159
1167
|
const float id = d ? 1.0f/d : 0.0f;
|
1160
1168
|
|
1161
|
-
y[i].d = d;
|
1169
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1162
1170
|
|
1163
1171
|
vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
|
1164
1172
|
|
@@ -1175,7 +1183,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1175
1183
|
|
1176
1184
|
// set y[i].s
|
1177
1185
|
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
|
1178
|
-
y[i].s = sum*d;
|
1186
|
+
y[i].s = GGML_FP32_TO_FP16(sum*d);
|
1179
1187
|
}
|
1180
1188
|
#else
|
1181
1189
|
GGML_UNUSED(nb);
|
@@ -1700,16 +1708,6 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
|
1700
1708
|
quantize_row_q2_K_reference(x, vy, k);
|
1701
1709
|
}
|
1702
1710
|
|
1703
|
-
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
1704
|
-
(void)hist; // TODO: collect histograms
|
1705
|
-
|
1706
|
-
for (int j = 0; j < n; j += k) {
|
1707
|
-
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
1708
|
-
quantize_row_q2_K_reference(src + j, y, k);
|
1709
|
-
}
|
1710
|
-
return (n/QK_K*sizeof(block_q2_K));
|
1711
|
-
}
|
1712
|
-
|
1713
1711
|
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
1714
1712
|
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
1715
1713
|
float rmin, float rdelta, int nstep, bool use_mad) {
|
@@ -1962,8 +1960,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
1962
1960
|
}
|
1963
1961
|
}
|
1964
1962
|
|
1965
|
-
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row,
|
1966
|
-
(void)hist;
|
1963
|
+
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
1967
1964
|
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
1968
1965
|
if (!quant_weights) {
|
1969
1966
|
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
@@ -2182,16 +2179,6 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
|
2182
2179
|
quantize_row_q3_K_reference(x, vy, k);
|
2183
2180
|
}
|
2184
2181
|
|
2185
|
-
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
2186
|
-
(void)hist; // TODO: collect histograms
|
2187
|
-
|
2188
|
-
for (int j = 0; j < n; j += k) {
|
2189
|
-
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
2190
|
-
quantize_row_q3_K_reference(src + j, y, k);
|
2191
|
-
}
|
2192
|
-
return (n/QK_K*sizeof(block_q3_K));
|
2193
|
-
}
|
2194
|
-
|
2195
2182
|
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
|
2196
2183
|
#if QK_K != 256
|
2197
2184
|
(void)quant_weights;
|
@@ -2281,8 +2268,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|
2281
2268
|
#endif
|
2282
2269
|
}
|
2283
2270
|
|
2284
|
-
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row,
|
2285
|
-
(void)hist;
|
2271
|
+
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
2286
2272
|
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
2287
2273
|
if (!quant_weights) {
|
2288
2274
|
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
@@ -2452,17 +2438,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
|
2452
2438
|
quantize_row_q4_K_reference(x, y, k);
|
2453
2439
|
}
|
2454
2440
|
|
2455
|
-
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
2456
|
-
assert(k % QK_K == 0);
|
2457
|
-
(void)hist; // TODO: collect histograms
|
2458
|
-
|
2459
|
-
for (int j = 0; j < n; j += k) {
|
2460
|
-
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
2461
|
-
quantize_row_q4_K_reference(src + j, y, k);
|
2462
|
-
}
|
2463
|
-
return (n/QK_K*sizeof(block_q4_K));
|
2464
|
-
}
|
2465
|
-
|
2466
2441
|
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
|
2467
2442
|
#if QK_K != 256
|
2468
2443
|
(void)quant_weights;
|
@@ -2541,8 +2516,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
2541
2516
|
#endif
|
2542
2517
|
}
|
2543
2518
|
|
2544
|
-
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row,
|
2545
|
-
(void)hist;
|
2519
|
+
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
2546
2520
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
2547
2521
|
if (!quant_weights) {
|
2548
2522
|
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
@@ -2753,17 +2727,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
|
2753
2727
|
quantize_row_q5_K_reference(x, y, k);
|
2754
2728
|
}
|
2755
2729
|
|
2756
|
-
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
2757
|
-
assert(k % QK_K == 0);
|
2758
|
-
(void)hist; // TODO: collect histograms
|
2759
|
-
|
2760
|
-
for (int j = 0; j < n; j += k) {
|
2761
|
-
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
2762
|
-
quantize_row_q5_K_reference(src + j, y, k);
|
2763
|
-
}
|
2764
|
-
return (n/QK_K*sizeof(block_q5_K));
|
2765
|
-
}
|
2766
|
-
|
2767
2730
|
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
|
2768
2731
|
#if QK_K != 256
|
2769
2732
|
(void)quant_weights;
|
@@ -2862,8 +2825,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
2862
2825
|
#endif
|
2863
2826
|
}
|
2864
2827
|
|
2865
|
-
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row,
|
2866
|
-
(void)hist;
|
2828
|
+
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
2867
2829
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
2868
2830
|
if (!quant_weights) {
|
2869
2831
|
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
@@ -3016,17 +2978,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
|
3016
2978
|
quantize_row_q6_K_reference(x, y, k);
|
3017
2979
|
}
|
3018
2980
|
|
3019
|
-
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
3020
|
-
assert(k % QK_K == 0);
|
3021
|
-
(void)hist; // TODO: collect histograms
|
3022
|
-
|
3023
|
-
for (int j = 0; j < n; j += k) {
|
3024
|
-
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
3025
|
-
quantize_row_q6_K_reference(src + j, y, k);
|
3026
|
-
}
|
3027
|
-
return (n/QK_K*sizeof(block_q6_K));
|
3028
|
-
}
|
3029
|
-
|
3030
2981
|
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
|
3031
2982
|
#if QK_K != 256
|
3032
2983
|
(void)quant_weights;
|
@@ -3116,8 +3067,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
3116
3067
|
#endif
|
3117
3068
|
}
|
3118
3069
|
|
3119
|
-
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row,
|
3120
|
-
(void)hist;
|
3070
|
+
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3121
3071
|
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
3122
3072
|
if (!quant_weights) {
|
3123
3073
|
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
@@ -3161,9 +3111,10 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
3161
3111
|
}
|
3162
3112
|
}
|
3163
3113
|
|
3164
|
-
size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row,
|
3114
|
+
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3165
3115
|
if (!quant_weights) {
|
3166
|
-
|
3116
|
+
quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
|
3117
|
+
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
3167
3118
|
}
|
3168
3119
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
3169
3120
|
char * qrow = (char *)dst;
|
@@ -3205,9 +3156,10 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
3205
3156
|
}
|
3206
3157
|
}
|
3207
3158
|
|
3208
|
-
size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row,
|
3159
|
+
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3209
3160
|
if (!quant_weights) {
|
3210
|
-
|
3161
|
+
quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
|
3162
|
+
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
3211
3163
|
}
|
3212
3164
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
3213
3165
|
char * qrow = (char *)dst;
|
@@ -3258,9 +3210,10 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
3258
3210
|
}
|
3259
3211
|
}
|
3260
3212
|
|
3261
|
-
size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row,
|
3213
|
+
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3262
3214
|
if (!quant_weights) {
|
3263
|
-
|
3215
|
+
quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
|
3216
|
+
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
3264
3217
|
}
|
3265
3218
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
3266
3219
|
char * qrow = (char *)dst;
|
@@ -3310,9 +3263,10 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
3310
3263
|
}
|
3311
3264
|
}
|
3312
3265
|
|
3313
|
-
size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row,
|
3266
|
+
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3314
3267
|
if (!quant_weights) {
|
3315
|
-
|
3268
|
+
quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
|
3269
|
+
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
3316
3270
|
}
|
3317
3271
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
3318
3272
|
char * qrow = (char *)dst;
|
@@ -3324,712 +3278,14 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
|
|
3324
3278
|
return nrow * row_size;
|
3325
3279
|
}
|
3326
3280
|
|
3327
|
-
|
3328
|
-
|
3329
|
-
|
3330
|
-
|
3331
|
-
|
3332
|
-
|
3333
|
-
0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
|
3334
|
-
0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
|
3335
|
-
0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
|
3336
|
-
0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
|
3337
|
-
0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
|
3338
|
-
0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
|
3339
|
-
0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
|
3340
|
-
0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
|
3341
|
-
0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
|
3342
|
-
0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
|
3343
|
-
0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
|
3344
|
-
0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
|
3345
|
-
0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
|
3346
|
-
0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
|
3347
|
-
0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
|
3348
|
-
0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
|
3349
|
-
0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
|
3350
|
-
0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
|
3351
|
-
0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
|
3352
|
-
0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
|
3353
|
-
0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
|
3354
|
-
0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
|
3355
|
-
0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
|
3356
|
-
0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
|
3357
|
-
0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
|
3358
|
-
0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
|
3359
|
-
0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
|
3360
|
-
0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
|
3361
|
-
0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
|
3362
|
-
0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
|
3363
|
-
0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
|
3364
|
-
0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
|
3365
|
-
0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
|
3366
|
-
0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
|
3367
|
-
0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
|
3368
|
-
0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
|
3369
|
-
0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
|
3370
|
-
0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
|
3371
|
-
0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
|
3372
|
-
0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
|
3373
|
-
0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
|
3374
|
-
0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
|
3375
|
-
0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
|
3376
|
-
0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
|
3377
|
-
0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
|
3378
|
-
0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
|
3379
|
-
0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
|
3380
|
-
0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
|
3381
|
-
0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
|
3382
|
-
0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
|
3383
|
-
0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
|
3384
|
-
0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
|
3385
|
-
0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
|
3386
|
-
0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
|
3387
|
-
0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
|
3388
|
-
0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
|
3389
|
-
0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
|
3390
|
-
0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
|
3391
|
-
0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
|
3392
|
-
0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
|
3393
|
-
0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
|
3394
|
-
};
|
3395
|
-
|
3396
|
-
static const uint64_t iq2xs_grid[512] = {
|
3397
|
-
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
3398
|
-
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
3399
|
-
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
3400
|
-
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
3401
|
-
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
3402
|
-
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
|
3403
|
-
0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
|
3404
|
-
0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
|
3405
|
-
0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
|
3406
|
-
0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
|
3407
|
-
0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
|
3408
|
-
0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
|
3409
|
-
0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
|
3410
|
-
0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
|
3411
|
-
0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
|
3412
|
-
0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
|
3413
|
-
0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
|
3414
|
-
0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
|
3415
|
-
0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
|
3416
|
-
0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
|
3417
|
-
0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
|
3418
|
-
0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
|
3419
|
-
0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
|
3420
|
-
0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
|
3421
|
-
0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
|
3422
|
-
0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
|
3423
|
-
0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
|
3424
|
-
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
|
3425
|
-
0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
|
3426
|
-
0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
|
3427
|
-
0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
|
3428
|
-
0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
|
3429
|
-
0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
|
3430
|
-
0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
|
3431
|
-
0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
|
3432
|
-
0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
|
3433
|
-
0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
|
3434
|
-
0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
|
3435
|
-
0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
|
3436
|
-
0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
|
3437
|
-
0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
|
3438
|
-
0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
|
3439
|
-
0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
|
3440
|
-
0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
|
3441
|
-
0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
|
3442
|
-
0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
|
3443
|
-
0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
|
3444
|
-
0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
|
3445
|
-
0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
|
3446
|
-
0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
|
3447
|
-
0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
|
3448
|
-
0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
|
3449
|
-
0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
|
3450
|
-
0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
|
3451
|
-
0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
|
3452
|
-
0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
|
3453
|
-
0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
|
3454
|
-
0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
|
3455
|
-
0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
|
3456
|
-
0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
|
3457
|
-
0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
|
3458
|
-
0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
|
3459
|
-
0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
|
3460
|
-
0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
|
3461
|
-
0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
|
3462
|
-
0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
|
3463
|
-
0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
|
3464
|
-
0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
|
3465
|
-
0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
|
3466
|
-
0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
|
3467
|
-
0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
|
3468
|
-
0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
|
3469
|
-
0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
|
3470
|
-
0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
|
3471
|
-
0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
|
3472
|
-
0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
|
3473
|
-
0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
|
3474
|
-
0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
|
3475
|
-
0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
|
3476
|
-
0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
|
3477
|
-
0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
|
3478
|
-
0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
|
3479
|
-
0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
|
3480
|
-
0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
|
3481
|
-
0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
|
3482
|
-
0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
|
3483
|
-
0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
|
3484
|
-
0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
|
3485
|
-
0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
|
3486
|
-
0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
|
3487
|
-
0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
|
3488
|
-
0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
|
3489
|
-
0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
|
3490
|
-
0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
|
3491
|
-
0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
|
3492
|
-
0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
|
3493
|
-
0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
|
3494
|
-
0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
|
3495
|
-
0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
|
3496
|
-
0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
|
3497
|
-
0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
|
3498
|
-
0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
|
3499
|
-
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
|
3500
|
-
0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
|
3501
|
-
0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
|
3502
|
-
0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
|
3503
|
-
0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
|
3504
|
-
0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
|
3505
|
-
0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
|
3506
|
-
0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
|
3507
|
-
0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
|
3508
|
-
0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
|
3509
|
-
0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
|
3510
|
-
0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
|
3511
|
-
0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
|
3512
|
-
0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
|
3513
|
-
0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
|
3514
|
-
0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
|
3515
|
-
0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
|
3516
|
-
0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
|
3517
|
-
0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
|
3518
|
-
0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
|
3519
|
-
0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
|
3520
|
-
0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
|
3521
|
-
0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
|
3522
|
-
0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
|
3523
|
-
0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
|
3524
|
-
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
3525
|
-
};
|
3526
|
-
|
3527
|
-
static const uint64_t iq2s_grid[1024] = {
|
3528
|
-
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
3529
|
-
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
3530
|
-
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
3531
|
-
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
3532
|
-
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
3533
|
-
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
3534
|
-
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
3535
|
-
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
3536
|
-
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
3537
|
-
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
3538
|
-
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
3539
|
-
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
3540
|
-
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
3541
|
-
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
3542
|
-
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
3543
|
-
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
3544
|
-
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
3545
|
-
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
3546
|
-
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
3547
|
-
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
3548
|
-
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
3549
|
-
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
3550
|
-
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
3551
|
-
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
3552
|
-
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
3553
|
-
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
3554
|
-
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
3555
|
-
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
3556
|
-
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
3557
|
-
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
3558
|
-
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
3559
|
-
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
3560
|
-
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
3561
|
-
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
3562
|
-
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
3563
|
-
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
3564
|
-
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
3565
|
-
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
3566
|
-
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
3567
|
-
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
3568
|
-
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
3569
|
-
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
3570
|
-
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
3571
|
-
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
3572
|
-
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
3573
|
-
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
3574
|
-
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
3575
|
-
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
3576
|
-
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
3577
|
-
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
3578
|
-
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
3579
|
-
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
3580
|
-
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
3581
|
-
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
3582
|
-
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
3583
|
-
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
3584
|
-
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
3585
|
-
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
3586
|
-
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
3587
|
-
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
3588
|
-
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
3589
|
-
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
3590
|
-
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
3591
|
-
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
3592
|
-
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
3593
|
-
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
3594
|
-
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
3595
|
-
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
3596
|
-
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
3597
|
-
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
3598
|
-
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
3599
|
-
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
3600
|
-
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
3601
|
-
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
3602
|
-
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
3603
|
-
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
3604
|
-
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
3605
|
-
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
3606
|
-
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
3607
|
-
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
3608
|
-
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
3609
|
-
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
3610
|
-
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
3611
|
-
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
3612
|
-
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
3613
|
-
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
3614
|
-
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
3615
|
-
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
3616
|
-
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
3617
|
-
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
3618
|
-
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
3619
|
-
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
3620
|
-
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
3621
|
-
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
3622
|
-
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
3623
|
-
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
3624
|
-
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
3625
|
-
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
3626
|
-
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
3627
|
-
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
3628
|
-
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
3629
|
-
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
3630
|
-
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
3631
|
-
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
3632
|
-
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
3633
|
-
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
3634
|
-
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
3635
|
-
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
3636
|
-
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
3637
|
-
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
3638
|
-
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
3639
|
-
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
3640
|
-
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
3641
|
-
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
3642
|
-
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
3643
|
-
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
3644
|
-
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
3645
|
-
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
3646
|
-
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
3647
|
-
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
3648
|
-
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
3649
|
-
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
3650
|
-
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
3651
|
-
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
3652
|
-
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
3653
|
-
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
3654
|
-
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
3655
|
-
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
3656
|
-
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
3657
|
-
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
3658
|
-
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
3659
|
-
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
3660
|
-
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
3661
|
-
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
3662
|
-
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
3663
|
-
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
3664
|
-
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
3665
|
-
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
3666
|
-
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
3667
|
-
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
3668
|
-
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
3669
|
-
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
3670
|
-
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
3671
|
-
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
3672
|
-
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
3673
|
-
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
3674
|
-
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
3675
|
-
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
3676
|
-
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
3677
|
-
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
3678
|
-
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
3679
|
-
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
3680
|
-
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
3681
|
-
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
3682
|
-
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
3683
|
-
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
3684
|
-
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
3685
|
-
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
3686
|
-
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
3687
|
-
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
3688
|
-
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
3689
|
-
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
3690
|
-
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
3691
|
-
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
3692
|
-
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
3693
|
-
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
3694
|
-
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
3695
|
-
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
3696
|
-
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
3697
|
-
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
3698
|
-
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
3699
|
-
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
3700
|
-
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
3701
|
-
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
3702
|
-
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
3703
|
-
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
3704
|
-
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
3705
|
-
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
3706
|
-
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
3707
|
-
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
3708
|
-
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
3709
|
-
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
3710
|
-
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
3711
|
-
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
3712
|
-
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
3713
|
-
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
3714
|
-
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
3715
|
-
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
3716
|
-
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
3717
|
-
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
3718
|
-
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
3719
|
-
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
3720
|
-
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
3721
|
-
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
3722
|
-
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
3723
|
-
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
3724
|
-
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
3725
|
-
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
3726
|
-
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
3727
|
-
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
3728
|
-
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
3729
|
-
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
3730
|
-
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
3731
|
-
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
3732
|
-
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
3733
|
-
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
3734
|
-
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
3735
|
-
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
3736
|
-
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
3737
|
-
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
3738
|
-
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
3739
|
-
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
3740
|
-
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
3741
|
-
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
3742
|
-
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
3743
|
-
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
3744
|
-
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
3745
|
-
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
3746
|
-
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
3747
|
-
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
3748
|
-
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
3749
|
-
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
3750
|
-
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
3751
|
-
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
3752
|
-
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
3753
|
-
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
3754
|
-
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
3755
|
-
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
3756
|
-
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
3757
|
-
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
3758
|
-
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
3759
|
-
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
3760
|
-
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
3761
|
-
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
3762
|
-
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
3763
|
-
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
3764
|
-
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
3765
|
-
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
3766
|
-
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
3767
|
-
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
3768
|
-
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
3769
|
-
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
3770
|
-
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
3771
|
-
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
3772
|
-
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
3773
|
-
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
3774
|
-
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
3775
|
-
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
3776
|
-
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
3777
|
-
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
3778
|
-
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
3779
|
-
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
3780
|
-
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
3781
|
-
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
3782
|
-
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
3783
|
-
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
3784
|
-
};
|
3785
|
-
|
3786
|
-
static const uint32_t iq3xxs_grid[256] = {
|
3787
|
-
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
3788
|
-
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
3789
|
-
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
3790
|
-
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
3791
|
-
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
3792
|
-
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
3793
|
-
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
3794
|
-
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
3795
|
-
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
3796
|
-
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
3797
|
-
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
3798
|
-
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
3799
|
-
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
3800
|
-
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
3801
|
-
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
3802
|
-
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
3803
|
-
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
3804
|
-
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
3805
|
-
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
3806
|
-
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
3807
|
-
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
3808
|
-
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
3809
|
-
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
3810
|
-
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
3811
|
-
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
3812
|
-
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
3813
|
-
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
3814
|
-
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
3815
|
-
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
3816
|
-
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
3817
|
-
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
3818
|
-
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
3819
|
-
};
|
3820
|
-
|
3821
|
-
static const uint32_t iq3xs_grid[512] = {
|
3822
|
-
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
3823
|
-
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
3824
|
-
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
3825
|
-
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
3826
|
-
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
3827
|
-
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
3828
|
-
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
3829
|
-
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
3830
|
-
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
3831
|
-
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
3832
|
-
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
3833
|
-
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
3834
|
-
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
3835
|
-
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
3836
|
-
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
3837
|
-
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
3838
|
-
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
3839
|
-
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
3840
|
-
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
3841
|
-
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
3842
|
-
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
3843
|
-
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
3844
|
-
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
3845
|
-
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
3846
|
-
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
3847
|
-
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
3848
|
-
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
3849
|
-
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
3850
|
-
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
3851
|
-
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
3852
|
-
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
3853
|
-
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
3854
|
-
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
3855
|
-
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
3856
|
-
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
3857
|
-
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
3858
|
-
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
3859
|
-
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
3860
|
-
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
3861
|
-
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
3862
|
-
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
3863
|
-
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
3864
|
-
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
3865
|
-
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
3866
|
-
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
3867
|
-
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
3868
|
-
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
3869
|
-
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
3870
|
-
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
3871
|
-
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
3872
|
-
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
3873
|
-
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
3874
|
-
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
3875
|
-
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
3876
|
-
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
3877
|
-
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
3878
|
-
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
3879
|
-
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
3880
|
-
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
3881
|
-
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
3882
|
-
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
3883
|
-
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
3884
|
-
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
3885
|
-
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
3886
|
-
};
|
3887
|
-
|
3888
|
-
#define NGRID_IQ2XXS 512
|
3889
|
-
static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
|
3890
|
-
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
3891
|
-
0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
|
3892
|
-
0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
|
3893
|
-
0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
|
3894
|
-
0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
|
3895
|
-
0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
|
3896
|
-
0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
|
3897
|
-
0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
|
3898
|
-
0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
|
3899
|
-
0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
|
3900
|
-
0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
|
3901
|
-
0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
|
3902
|
-
0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
|
3903
|
-
0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
|
3904
|
-
0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
|
3905
|
-
0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
|
3906
|
-
0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
|
3907
|
-
0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
|
3908
|
-
0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
|
3909
|
-
0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
|
3910
|
-
0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
|
3911
|
-
0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
|
3912
|
-
0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
|
3913
|
-
0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
|
3914
|
-
0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
|
3915
|
-
0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
|
3916
|
-
0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
|
3917
|
-
0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
|
3918
|
-
0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
|
3919
|
-
0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
|
3920
|
-
0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
|
3921
|
-
0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
|
3922
|
-
0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
|
3923
|
-
0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
|
3924
|
-
0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
|
3925
|
-
0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
|
3926
|
-
0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
|
3927
|
-
0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
|
3928
|
-
0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
|
3929
|
-
0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
|
3930
|
-
0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
|
3931
|
-
0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
|
3932
|
-
0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
|
3933
|
-
0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
|
3934
|
-
0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
|
3935
|
-
0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
|
3936
|
-
0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
|
3937
|
-
0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
|
3938
|
-
0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
|
3939
|
-
0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
|
3940
|
-
0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
|
3941
|
-
0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
|
3942
|
-
0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
|
3943
|
-
0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
|
3944
|
-
0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
|
3945
|
-
0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
|
3946
|
-
0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
|
3947
|
-
0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
|
3948
|
-
0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
|
3949
|
-
0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
|
3950
|
-
0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
|
3951
|
-
0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
|
3952
|
-
0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
|
3953
|
-
0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
|
3954
|
-
0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
|
3955
|
-
0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
|
3956
|
-
0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
|
3957
|
-
0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
|
3958
|
-
0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
|
3959
|
-
0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
|
3960
|
-
0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
|
3961
|
-
0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
|
3962
|
-
0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
|
3963
|
-
0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
|
3964
|
-
0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
|
3965
|
-
0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
|
3966
|
-
0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
|
3967
|
-
0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
|
3968
|
-
0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
|
3969
|
-
0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
|
3970
|
-
0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
|
3971
|
-
0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
|
3972
|
-
0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
|
3973
|
-
0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
|
3974
|
-
0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
|
3975
|
-
0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
|
3976
|
-
0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
|
3977
|
-
0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
|
3978
|
-
0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
|
3979
|
-
0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
|
3980
|
-
0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
|
3981
|
-
0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
|
3982
|
-
0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
|
3983
|
-
0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
|
3984
|
-
0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
|
3985
|
-
0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
|
3986
|
-
0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
|
3987
|
-
0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
|
3988
|
-
0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
|
3989
|
-
0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
|
3990
|
-
0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
|
3991
|
-
0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
|
3992
|
-
0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
|
3993
|
-
0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
|
3994
|
-
0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
|
3995
|
-
0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
|
3996
|
-
0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
|
3997
|
-
0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
|
3998
|
-
0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
|
3999
|
-
0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
|
4000
|
-
0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
|
4001
|
-
0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
|
4002
|
-
0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
|
4003
|
-
0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
|
4004
|
-
0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
|
4005
|
-
0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
|
4006
|
-
0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
|
4007
|
-
0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
|
4008
|
-
0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
|
4009
|
-
0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
|
4010
|
-
0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
|
4011
|
-
0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
|
4012
|
-
0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
|
4013
|
-
0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
|
4014
|
-
0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
|
4015
|
-
0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
|
4016
|
-
0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
|
4017
|
-
0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
|
4018
|
-
|
4019
|
-
};
|
4020
|
-
|
4021
|
-
static const uint8_t ksigns_iq2xs[128] = {
|
4022
|
-
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
4023
|
-
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
4024
|
-
160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
|
4025
|
-
48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
|
4026
|
-
192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
|
4027
|
-
80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
|
4028
|
-
96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
|
4029
|
-
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
4030
|
-
};
|
3281
|
+
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3282
|
+
(void)quant_weights; // not used
|
3283
|
+
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
3284
|
+
quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
|
3285
|
+
return nrow * row_size;
|
3286
|
+
}
|
4031
3287
|
|
4032
|
-
|
3288
|
+
// ====================== "True" 2-bit (de)-quantization
|
4033
3289
|
|
4034
3290
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
4035
3291
|
assert(k % QK_K == 0);
|
@@ -4162,11 +3418,11 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
|
4162
3418
|
const uint8_t * signs = x[i].signs;
|
4163
3419
|
|
4164
3420
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
4165
|
-
const float db1 = d * (
|
4166
|
-
const float db2 = d * (
|
3421
|
+
const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
3422
|
+
const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
4167
3423
|
for (int l = 0; l < 4; ++l) {
|
4168
|
-
const uint8_t * grid1 = (const uint8_t *)(
|
4169
|
-
const uint8_t * grid2 = (const uint8_t *)(
|
3424
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
3425
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
4170
3426
|
for (int j = 0; j < 4; ++j) {
|
4171
3427
|
y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4172
3428
|
y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
@@ -4176,8 +3432,8 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
|
4176
3432
|
qs += 8;
|
4177
3433
|
signs += 4;
|
4178
3434
|
for (int l = 0; l < 4; ++l) {
|
4179
|
-
const uint8_t * grid1 = (const uint8_t *)(
|
4180
|
-
const uint8_t * grid2 = (const uint8_t *)(
|
3435
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
|
3436
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
|
4181
3437
|
for (int j = 0; j < 4; ++j) {
|
4182
3438
|
y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4183
3439
|
y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
@@ -4197,39 +3453,23 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
|
|
4197
3453
|
assert(k % QK_K == 0);
|
4198
3454
|
const int nb = k / QK_K;
|
4199
3455
|
|
4200
|
-
float db[4];
|
4201
|
-
uint16_t idx[4];
|
4202
|
-
//const int8_t * grid[4];
|
4203
|
-
|
4204
3456
|
for (int i = 0; i < nb; i++) {
|
4205
3457
|
|
4206
3458
|
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4207
|
-
const uint8_t
|
4208
|
-
const
|
3459
|
+
const uint8_t * qs = x[i].qs;
|
3460
|
+
const uint16_t * qh = x[i].qh;
|
4209
3461
|
|
4210
|
-
for (int
|
4211
|
-
|
4212
|
-
|
4213
|
-
idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
|
4214
|
-
idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
|
4215
|
-
//grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
|
4216
|
-
//grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
|
4217
|
-
//grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
|
4218
|
-
//grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
|
4219
|
-
db[0] = d * (2*(sc[0] & 7) + 1);
|
4220
|
-
db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
|
4221
|
-
db[2] = d * (2*(sc[1] & 7) + 1);
|
4222
|
-
db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
|
3462
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
3463
|
+
const float dl = d * (2*((qh[ib] >> 12) & 7) + 1);
|
3464
|
+
const float delta = qh[ib] & 0x8000 ? -IQ1S_DELTA : IQ1S_DELTA;
|
4223
3465
|
for (int l = 0; l < 4; ++l) {
|
4224
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid +
|
3466
|
+
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
4225
3467
|
for (int j = 0; j < 8; ++j) {
|
4226
|
-
|
4227
|
-
y[j] = db[l] * grid[j];
|
3468
|
+
y[j] = dl * (grid[j] + delta);
|
4228
3469
|
}
|
4229
3470
|
y += 8;
|
4230
3471
|
}
|
4231
3472
|
qs += 4;
|
4232
|
-
sc += 2;
|
4233
3473
|
}
|
4234
3474
|
}
|
4235
3475
|
}
|
@@ -4783,10 +4023,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4783
4023
|
const block_q8_1 * restrict b_y0 = &vy0[i];
|
4784
4024
|
const block_q8_1 * restrict b_y1 = &vy1[i];
|
4785
4025
|
|
4786
|
-
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
|
4787
|
-
GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
|
4788
|
-
GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
|
4789
|
-
GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
|
4026
|
+
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
|
4027
|
+
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
|
4028
|
+
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
|
4029
|
+
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
|
4790
4030
|
summs0 += summs_t;
|
4791
4031
|
|
4792
4032
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
@@ -4807,10 +4047,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4807
4047
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
4808
4048
|
|
4809
4049
|
// mmla into int32x4_t
|
4810
|
-
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*
|
4811
|
-
GGML_FP16_TO_FP32(b_x0->d)*
|
4812
|
-
GGML_FP16_TO_FP32(b_x1->d)*
|
4813
|
-
GGML_FP16_TO_FP32(b_x1->d)*
|
4050
|
+
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
|
4051
|
+
GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
|
4052
|
+
GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
|
4053
|
+
GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
|
4814
4054
|
|
4815
4055
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
4816
4056
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
@@ -4851,7 +4091,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4851
4091
|
const block_q8_1 * restrict y0 = &y[i + 0];
|
4852
4092
|
const block_q8_1 * restrict y1 = &y[i + 1];
|
4853
4093
|
|
4854
|
-
summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
|
4094
|
+
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
|
4855
4095
|
|
4856
4096
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
4857
4097
|
|
@@ -4874,8 +4114,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4874
4114
|
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
4875
4115
|
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
4876
4116
|
|
4877
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
4878
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
4117
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
4118
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
4879
4119
|
}
|
4880
4120
|
|
4881
4121
|
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
|
@@ -4888,9 +4128,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4888
4128
|
// Main loop
|
4889
4129
|
for (int i = 0; i < nb; ++i) {
|
4890
4130
|
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
4891
|
-
const float d1 = y[i].d;
|
4131
|
+
const float d1 = GGML_FP16_TO_FP32(y[i].d);
|
4892
4132
|
|
4893
|
-
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
4133
|
+
summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
|
4894
4134
|
|
4895
4135
|
const __m256 d0v = _mm256_set1_ps( d0 );
|
4896
4136
|
const __m256 d1v = _mm256_set1_ps( d1 );
|
@@ -4942,7 +4182,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4942
4182
|
|
4943
4183
|
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
4944
4184
|
|
4945
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
4185
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
4946
4186
|
}
|
4947
4187
|
|
4948
4188
|
*s = sumf;
|
@@ -4960,7 +4200,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4960
4200
|
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
4961
4201
|
}
|
4962
4202
|
|
4963
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
4203
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
4964
4204
|
}
|
4965
4205
|
|
4966
4206
|
*s = sumf;
|
@@ -5296,8 +4536,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5296
4536
|
|
5297
4537
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
5298
4538
|
|
5299
|
-
summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
|
5300
|
-
summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
|
4539
|
+
summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
|
4540
|
+
summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
|
5301
4541
|
|
5302
4542
|
// extract the 5th bit via lookup table ((b) << 4)
|
5303
4543
|
memcpy(&qh0, x0->qh, sizeof(qh0));
|
@@ -5341,10 +4581,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5341
4581
|
|
5342
4582
|
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
5343
4583
|
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
5344
|
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
4584
|
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
5345
4585
|
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
5346
4586
|
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
5347
|
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
4587
|
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
5348
4588
|
}
|
5349
4589
|
|
5350
4590
|
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
@@ -5361,7 +4601,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5361
4601
|
const block_q5_1 * restrict x0 = &x[i];
|
5362
4602
|
const block_q8_1 * restrict y0 = &y[i];
|
5363
4603
|
|
5364
|
-
summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
|
4604
|
+
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
|
5365
4605
|
|
5366
4606
|
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
5367
4607
|
|
@@ -5408,7 +4648,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5408
4648
|
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
5409
4649
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
5410
4650
|
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
5411
|
-
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
|
4651
|
+
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
|
5412
4652
|
}
|
5413
4653
|
|
5414
4654
|
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
@@ -5423,14 +4663,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5423
4663
|
for (int i = 0; i < nb; i++) {
|
5424
4664
|
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
5425
4665
|
|
5426
|
-
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
4666
|
+
summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
|
5427
4667
|
|
5428
4668
|
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
5429
4669
|
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
5430
4670
|
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
5431
4671
|
qx = _mm256_or_si256(qx, bxhi);
|
5432
4672
|
|
5433
|
-
const __m256 dy = _mm256_set1_ps(y[i].d);
|
4673
|
+
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
|
5434
4674
|
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
5435
4675
|
|
5436
4676
|
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
@@ -5450,7 +4690,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5450
4690
|
for (int i = 0; i < nb; i++) {
|
5451
4691
|
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
5452
4692
|
|
5453
|
-
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
4693
|
+
summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
|
5454
4694
|
|
5455
4695
|
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
|
5456
4696
|
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
@@ -5464,7 +4704,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5464
4704
|
bxh = _mm_or_si128(bxh, bxhih);
|
5465
4705
|
bx_0 = MM256_SET_M128I(bxh, bxl);
|
5466
4706
|
|
5467
|
-
const __m256 dy = _mm256_set1_ps(y[i].d);
|
4707
|
+
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
|
5468
4708
|
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
5469
4709
|
|
5470
4710
|
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
|
@@ -5531,7 +4771,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5531
4771
|
|
5532
4772
|
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
5533
4773
|
|
5534
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
4774
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
5535
4775
|
}
|
5536
4776
|
|
5537
4777
|
*s = sumf;
|
@@ -5555,7 +4795,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5555
4795
|
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
5556
4796
|
}
|
5557
4797
|
|
5558
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
4798
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
5559
4799
|
}
|
5560
4800
|
|
5561
4801
|
*s = sumf;
|
@@ -9563,7 +8803,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9563
8803
|
|
9564
8804
|
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
9565
8805
|
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
|
9566
|
-
const __m256i full_signs =
|
8806
|
+
const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
|
9567
8807
|
|
9568
8808
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
9569
8809
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
|
@@ -9585,8 +8825,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9585
8825
|
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
9586
8826
|
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
9587
8827
|
|
9588
|
-
const __m256i sc1 =
|
9589
|
-
const __m256i sc2 =
|
8828
|
+
const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
|
8829
|
+
const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
|
9590
8830
|
|
9591
8831
|
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
|
9592
8832
|
|
@@ -9653,8 +8893,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9653
8893
|
|
9654
8894
|
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
|
9655
8895
|
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
|
9656
|
-
const __m256i full_signs_1 =
|
9657
|
-
const __m256i full_signs_2 =
|
8896
|
+
const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
|
8897
|
+
const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
|
9658
8898
|
|
9659
8899
|
__m256i signs;
|
9660
8900
|
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
|
@@ -9757,8 +8997,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9757
8997
|
|
9758
8998
|
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
9759
8999
|
|
9760
|
-
const
|
9761
|
-
const uint8x16_t
|
9000
|
+
const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
|
9001
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
9762
9002
|
const uint8x16_t m1 = vdupq_n_u8(1);
|
9763
9003
|
const int32x4_t vzero = vdupq_n_s32(0);
|
9764
9004
|
|
@@ -9789,7 +9029,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9789
9029
|
vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
|
9790
9030
|
qs += 8;
|
9791
9031
|
|
9792
|
-
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
9032
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
|
9793
9033
|
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9794
9034
|
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9795
9035
|
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
@@ -9798,7 +9038,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9798
9038
|
q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
|
9799
9039
|
q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
|
9800
9040
|
|
9801
|
-
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
9041
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
|
9802
9042
|
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9803
9043
|
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9804
9044
|
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
@@ -9869,12 +9109,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9869
9109
|
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
9870
9110
|
qs += 8;
|
9871
9111
|
|
9872
|
-
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
9112
|
+
__m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
|
9873
9113
|
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9874
9114
|
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
9875
9115
|
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
9876
9116
|
|
9877
|
-
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
9117
|
+
aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
|
9878
9118
|
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9879
9119
|
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
9880
9120
|
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
@@ -10074,7 +9314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
10074
9314
|
#endif
|
10075
9315
|
}
|
10076
9316
|
|
10077
|
-
void ggml_vec_dot_iq3_s_q8_K (int n, float *
|
9317
|
+
void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
10078
9318
|
assert(n % QK_K == 0);
|
10079
9319
|
assert(nrc == 1);
|
10080
9320
|
UNUSED(nrc);
|
@@ -10089,18 +9329,35 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10089
9329
|
|
10090
9330
|
#if defined(__ARM_NEON)
|
10091
9331
|
|
9332
|
+
typedef union {
|
9333
|
+
uint16x8_t vec_index;
|
9334
|
+
uint16_t index[8];
|
9335
|
+
} vec_index_t;
|
9336
|
+
|
10092
9337
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
10093
9338
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
10094
9339
|
};
|
10095
9340
|
|
10096
9341
|
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
10097
9342
|
|
10098
|
-
const
|
10099
|
-
|
9343
|
+
static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
|
9344
|
+
|
9345
|
+
const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
|
9346
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
9347
|
+
|
9348
|
+
const int16x8_t hshift = vld1q_s16(k_shift);
|
9349
|
+
const uint16x8_t m256 = vdupq_n_u16(256);
|
9350
|
+
const uint8x16_t m1 = vdupq_n_u8(1);
|
10100
9351
|
|
10101
9352
|
uint8x16x2_t vs;
|
10102
9353
|
ggml_int8x16x4_t q3s;
|
10103
9354
|
ggml_int8x16x4_t q8b;
|
9355
|
+
vec_index_t idx;
|
9356
|
+
|
9357
|
+
#if QK_K == 256
|
9358
|
+
uint32_t scales32[2];
|
9359
|
+
const uint8_t * scales8 = (const uint8_t *)scales32;
|
9360
|
+
#endif
|
10104
9361
|
|
10105
9362
|
float sumf = 0;
|
10106
9363
|
for (int i = 0; i < nb; ++i) {
|
@@ -10109,47 +9366,63 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10109
9366
|
const uint8_t * restrict qh = x[i].qh;
|
10110
9367
|
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
10111
9368
|
const int8_t * restrict q8 = y[i].qs;
|
9369
|
+
|
9370
|
+
#if QK_K == 256
|
9371
|
+
memcpy(scales32, x[i].scales, 4);
|
9372
|
+
scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
|
9373
|
+
scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
|
9374
|
+
#endif
|
9375
|
+
|
10112
9376
|
int sumi1 = 0, sumi2 = 0;
|
10113
9377
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10114
9378
|
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
10115
|
-
const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
|
10116
|
-
iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
|
10117
|
-
const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
|
10118
|
-
iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
|
10119
|
-
const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
|
10120
|
-
iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
|
10121
|
-
const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
|
10122
|
-
iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
|
10123
|
-
qs += 16;
|
10124
9379
|
|
10125
|
-
|
9380
|
+
const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
|
9381
|
+
idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
|
9382
|
+
const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
|
9383
|
+
iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
|
9384
|
+
const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
|
9385
|
+
iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
|
9386
|
+
idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
|
9387
|
+
const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
|
9388
|
+
iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
|
9389
|
+
const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
|
9390
|
+
iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
|
9391
|
+
|
9392
|
+
|
9393
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
|
10126
9394
|
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
10127
9395
|
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
10128
|
-
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
10129
|
-
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9396
|
+
vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
|
9397
|
+
vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
|
10130
9398
|
|
10131
|
-
q3s.val[0] =
|
10132
|
-
q3s.val[1] =
|
9399
|
+
q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
|
9400
|
+
q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
|
10133
9401
|
|
10134
|
-
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
9402
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
|
10135
9403
|
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
10136
9404
|
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
10137
|
-
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
10138
|
-
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9405
|
+
vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
|
9406
|
+
vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
|
10139
9407
|
|
10140
9408
|
signs += 4;
|
10141
9409
|
|
10142
|
-
q3s.val[2] =
|
10143
|
-
q3s.val[3] =
|
9410
|
+
q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
|
9411
|
+
q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
|
10144
9412
|
|
10145
9413
|
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
10146
9414
|
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
9415
|
+
#if QK_K == 256
|
9416
|
+
sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
|
9417
|
+
sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
|
9418
|
+
#else
|
10147
9419
|
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
10148
9420
|
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
9421
|
+
#endif
|
10149
9422
|
}
|
10150
9423
|
sumf += d*(sumi1 + sumi2);
|
10151
9424
|
}
|
10152
|
-
*s =
|
9425
|
+
*s = sumf;
|
10153
9426
|
|
10154
9427
|
#elif defined(__AVX2__)
|
10155
9428
|
|
@@ -10164,6 +9437,16 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10164
9437
|
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
10165
9438
|
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
10166
9439
|
|
9440
|
+
const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
|
9441
|
+
const __m256i idx_mask = _mm256_set1_epi32(256);
|
9442
|
+
|
9443
|
+
typedef union {
|
9444
|
+
__m256i vec[2];
|
9445
|
+
uint32_t index[16];
|
9446
|
+
} index_t;
|
9447
|
+
|
9448
|
+
index_t idx;
|
9449
|
+
|
10167
9450
|
__m256 accumf = _mm256_setzero_ps();
|
10168
9451
|
for (int i = 0; i < nb; ++i) {
|
10169
9452
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
@@ -10176,24 +9459,25 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10176
9459
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10177
9460
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10178
9461
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10179
|
-
const __m256i
|
10180
|
-
|
10181
|
-
|
10182
|
-
|
10183
|
-
|
10184
|
-
|
10185
|
-
|
10186
|
-
|
10187
|
-
|
10188
|
-
const __m256i
|
10189
|
-
|
10190
|
-
|
10191
|
-
|
10192
|
-
|
10193
|
-
|
10194
|
-
|
10195
|
-
|
10196
|
-
|
9462
|
+
const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
|
9463
|
+
idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
|
9464
|
+
idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
|
9465
|
+
idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
|
9466
|
+
idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
|
9467
|
+
idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
|
9468
|
+
idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
|
9469
|
+
|
9470
|
+
// At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
|
9471
|
+
//const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
|
9472
|
+
//const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
|
9473
|
+
const __m256i q2_1 = _mm256_set_epi32(
|
9474
|
+
iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
|
9475
|
+
iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
|
9476
|
+
);
|
9477
|
+
const __m256i q2_2 = _mm256_set_epi32(
|
9478
|
+
iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
|
9479
|
+
iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
|
9480
|
+
);
|
10197
9481
|
|
10198
9482
|
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
10199
9483
|
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
@@ -10221,7 +9505,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10221
9505
|
|
10222
9506
|
}
|
10223
9507
|
|
10224
|
-
*s =
|
9508
|
+
*s = hsum_float_8(accumf);
|
10225
9509
|
|
10226
9510
|
#else
|
10227
9511
|
|
@@ -10238,8 +9522,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10238
9522
|
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
10239
9523
|
int32_t sumi = 0;
|
10240
9524
|
for (int l = 0; l < 4; ++l) {
|
10241
|
-
const uint8_t * grid1 = (const uint8_t *)(
|
10242
|
-
const uint8_t * grid2 = (const uint8_t *)(
|
9525
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
9526
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
10243
9527
|
for (int j = 0; j < 4; ++j) {
|
10244
9528
|
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
10245
9529
|
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
@@ -10251,8 +9535,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10251
9535
|
bsum += sumi * ls1;
|
10252
9536
|
sumi = 0;
|
10253
9537
|
for (int l = 0; l < 4; ++l) {
|
10254
|
-
const uint8_t * grid1 = (const uint8_t *)(
|
10255
|
-
const uint8_t * grid2 = (const uint8_t *)(
|
9538
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
9539
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
10256
9540
|
for (int j = 0; j < 4; ++j) {
|
10257
9541
|
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
10258
9542
|
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
@@ -10265,7 +9549,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10265
9549
|
}
|
10266
9550
|
sumf += d * bsum;
|
10267
9551
|
}
|
10268
|
-
*s =
|
9552
|
+
*s = sumf;
|
10269
9553
|
#endif
|
10270
9554
|
}
|
10271
9555
|
|
@@ -10278,7 +9562,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
|
10278
9562
|
}
|
10279
9563
|
#endif
|
10280
9564
|
|
10281
|
-
void ggml_vec_dot_iq1_s_q8_K (int n, float *
|
9565
|
+
void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
10282
9566
|
assert(n % QK_K == 0);
|
10283
9567
|
assert(nrc == 1);
|
10284
9568
|
UNUSED(nrc);
|
@@ -10291,155 +9575,119 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
10291
9575
|
|
10292
9576
|
const int nb = n / QK_K;
|
10293
9577
|
|
10294
|
-
|
10295
|
-
#if defined __ARM_NEON && QK_K == 256
|
10296
|
-
|
10297
|
-
const uint8x16_t m8 = vdupq_n_u8(0x08);
|
10298
|
-
const uint8x16_t m7 = vdupq_n_u8(0x07);
|
10299
|
-
const uint8x16_t m1 = vdupq_n_u8(0x01);
|
10300
|
-
const int32x4_t vzero = vdupq_n_s32(0);
|
9578
|
+
#if defined __ARM_NEON
|
10301
9579
|
|
10302
|
-
|
10303
|
-
uint16x8x2_t vindex;
|
10304
|
-
int8x16x4_t q1b;
|
9580
|
+
ggml_int8x16x4_t q1b;
|
10305
9581
|
ggml_int8x16x4_t q8b;
|
10306
|
-
uint16x8x4_t scales;
|
10307
|
-
int32x4x2_t sumi;
|
10308
|
-
int32x4x2_t dotq;
|
10309
9582
|
|
10310
9583
|
float sumf = 0;
|
10311
9584
|
for (int i = 0; i < nb; ++i) {
|
10312
9585
|
|
10313
|
-
const int8_t
|
10314
|
-
const uint8_t
|
10315
|
-
const
|
9586
|
+
const int8_t * q8 = y[i].qs;
|
9587
|
+
const uint8_t * qs = x[i].qs;
|
9588
|
+
const uint16_t * qh = x[i].qh;
|
10316
9589
|
|
10317
|
-
|
9590
|
+
int sumi1 = 0, sumi2 = 0, sumi3 = 0;
|
10318
9591
|
|
10319
|
-
for (int
|
10320
|
-
const uint8x16_t ql = vld1q_u8(qs); qs += 16;
|
10321
|
-
const uint8x8_t tm1 = vld1_u8 (sc); sc += 8;
|
10322
|
-
const uint8x8_t tm2 = vshr_n_u8(tm1, 4);
|
10323
|
-
const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
|
10324
|
-
const uint8x16_t hbit = vandq_u8(qh, m8);
|
10325
|
-
vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
|
10326
|
-
vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
|
10327
|
-
const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
|
10328
|
-
scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
|
10329
|
-
scales.val[1] = vmovl_u8(vget_high_u8 (scales8));
|
9592
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
10330
9593
|
|
10331
|
-
|
10332
|
-
|
10333
|
-
|
10334
|
-
|
10335
|
-
|
10336
|
-
|
10337
|
-
|
9594
|
+
q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))),
|
9595
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700)))));
|
9596
|
+
q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))),
|
9597
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700)))));
|
9598
|
+
q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))),
|
9599
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700)))));
|
9600
|
+
q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))),
|
9601
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700)))));
|
9602
|
+
qs += 8;
|
10338
9603
|
|
10339
|
-
|
10340
|
-
|
9604
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9605
|
+
|
9606
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]);
|
9607
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]);
|
9608
|
+
|
9609
|
+
const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
|
9610
|
+
const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
|
9611
|
+
sumi1 += vaddvq_s32(p1) * ls1;
|
9612
|
+
sumi2 += vaddvq_s32(p2) * ls2;
|
9613
|
+
sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1)
|
9614
|
+
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1);
|
10341
9615
|
|
10342
|
-
sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
|
10343
|
-
sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
|
10344
|
-
}
|
10345
9616
|
}
|
10346
9617
|
|
10347
|
-
sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) *
|
9618
|
+
sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
|
10348
9619
|
}
|
10349
9620
|
|
10350
9621
|
*s = sumf;
|
10351
9622
|
|
10352
|
-
|
10353
|
-
#elif defined __AVX2__ && QK_K == 256
|
10354
|
-
|
10355
|
-
const __m128i m8 = _mm_set1_epi8(0x08);
|
10356
|
-
const __m128i m7 = _mm_set1_epi8(0x07);
|
10357
|
-
const __m128i m1 = _mm_set1_epi8(0x01);
|
10358
|
-
const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
|
10359
|
-
const __m128i shuffle_s[4] = {
|
10360
|
-
_mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
|
10361
|
-
_mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
|
10362
|
-
_mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
|
10363
|
-
_mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
|
10364
|
-
};
|
10365
|
-
|
10366
|
-
uint64_t aux64;
|
10367
|
-
|
10368
|
-
typedef union m256i_uint16 {
|
10369
|
-
__m256i reg;
|
10370
|
-
uint16_t s[16];
|
10371
|
-
} m256i_uint16_t;
|
10372
|
-
|
10373
|
-
m256i_uint16_t v_gindex;
|
9623
|
+
#elif defined __AVX2__
|
10374
9624
|
|
10375
9625
|
__m256 accum = _mm256_setzero_ps();
|
9626
|
+
float accum1 = 0;
|
10376
9627
|
for (int i = 0; i < nb; ++i) {
|
10377
9628
|
|
10378
|
-
const int8_t
|
10379
|
-
const uint8_t
|
10380
|
-
const
|
9629
|
+
const int8_t * q8 = y[i].qs;
|
9630
|
+
const uint8_t * qs = x[i].qs;
|
9631
|
+
const uint16_t * qh = x[i].qh;
|
10381
9632
|
|
10382
9633
|
__m256i sumi = _mm256_setzero_si256();
|
10383
|
-
|
10384
|
-
|
10385
|
-
|
10386
|
-
|
10387
|
-
const __m256i
|
10388
|
-
|
10389
|
-
|
9634
|
+
int sumi1 = 0;
|
9635
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
9636
|
+
const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
|
9637
|
+
iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
|
9638
|
+
const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
|
9639
|
+
iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
|
9640
|
+
qs += 8;
|
9641
|
+
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
9642
|
+
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
10390
9643
|
|
10391
|
-
|
10392
|
-
|
10393
|
-
|
10394
|
-
|
10395
|
-
|
10396
|
-
|
10397
|
-
const __m256i p = _mm256_madd_epi16(s16, dot);
|
10398
|
-
sumi = _mm256_add_epi32(sumi, p);
|
10399
|
-
}
|
9644
|
+
const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
|
9645
|
+
const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
|
9646
|
+
const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
|
9647
|
+
const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
|
9648
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
|
9649
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
|
10400
9650
|
|
9651
|
+
sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
|
9652
|
+
sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
|
9653
|
+
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
|
10401
9654
|
}
|
10402
9655
|
|
10403
|
-
|
9656
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
9657
|
+
accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
|
9658
|
+
accum1 += d * sumi1;
|
10404
9659
|
|
10405
9660
|
}
|
10406
9661
|
|
10407
|
-
*s = hsum_float_8(accum);
|
9662
|
+
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
10408
9663
|
|
10409
9664
|
#else
|
10410
9665
|
|
10411
|
-
int db[4];
|
10412
|
-
uint16_t idx[4];
|
10413
|
-
|
10414
9666
|
float sumf = 0;
|
10415
|
-
for (int i = 0; i < nb; ++
|
9667
|
+
for (int i = 0; i < nb; i++) {
|
10416
9668
|
|
10417
|
-
const int8_t
|
10418
|
-
const uint8_t
|
10419
|
-
const
|
9669
|
+
const int8_t * q8 = y[i].qs;
|
9670
|
+
const uint8_t * qs = x[i].qs;
|
9671
|
+
const uint16_t * qh = x[i].qh;
|
10420
9672
|
|
10421
|
-
int sumi = 0;
|
10422
|
-
for (int
|
10423
|
-
|
10424
|
-
|
10425
|
-
|
10426
|
-
idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
|
10427
|
-
db[0] = (2*(sc[0] & 7) + 1);
|
10428
|
-
db[1] = (2*((sc[0] >> 4) & 7) + 1);
|
10429
|
-
db[2] = (2*(sc[1] & 7) + 1);
|
10430
|
-
db[3] = (2*((sc[1] >> 4) & 7) + 1);
|
9673
|
+
int sumi = 0, sumi1 = 0;
|
9674
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
9675
|
+
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
9676
|
+
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
9677
|
+
int lsum = 0;
|
10431
9678
|
for (int l = 0; l < 4; ++l) {
|
10432
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid +
|
10433
|
-
int
|
10434
|
-
|
10435
|
-
|
9679
|
+
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
9680
|
+
for (int j = 0; j < 8; ++j) {
|
9681
|
+
lsum += q8[j] * grid[j];
|
9682
|
+
}
|
10436
9683
|
q8 += 8;
|
10437
9684
|
}
|
9685
|
+
sumi += ls * lsum;
|
9686
|
+
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
10438
9687
|
qs += 4;
|
10439
|
-
sc += 2;
|
10440
9688
|
}
|
10441
9689
|
|
10442
|
-
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
|
9690
|
+
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
10443
9691
|
}
|
10444
9692
|
|
10445
9693
|
*s = sumf;
|
@@ -10508,10 +9756,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
10508
9756
|
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
|
10509
9757
|
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
|
10510
9758
|
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
|
10511
|
-
const __m256i q4b_1 =
|
10512
|
-
|
10513
|
-
const __m256i q4b_2 =
|
10514
|
-
|
9759
|
+
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
9760
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
9761
|
+
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
9762
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
10515
9763
|
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
10516
9764
|
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
10517
9765
|
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
|
@@ -10618,10 +9866,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
10618
9866
|
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
|
10619
9867
|
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10620
9868
|
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10621
|
-
const __m256i q4b_1 =
|
10622
|
-
|
10623
|
-
const __m256i q4b_2 =
|
10624
|
-
|
9869
|
+
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
9870
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
9871
|
+
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
9872
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
10625
9873
|
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
10626
9874
|
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
10627
9875
|
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
@@ -10700,7 +9948,7 @@ static inline int iq2_grid_size(enum ggml_type type) {
|
|
10700
9948
|
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
10701
9949
|
return type == GGML_TYPE_IQ2_XXS ? 256 :
|
10702
9950
|
type == GGML_TYPE_IQ2_XS ? 512 :
|
10703
|
-
type == GGML_TYPE_IQ1_S ?
|
9951
|
+
type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024;
|
10704
9952
|
}
|
10705
9953
|
|
10706
9954
|
static int iq2_compare_func(const void * left, const void * right) {
|
@@ -10767,39 +10015,135 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
10767
10015
|
40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
|
10768
10016
|
42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
|
10769
10017
|
};
|
10770
|
-
static const uint16_t
|
10771
|
-
|
10772
|
-
|
10773
|
-
|
10774
|
-
|
10775
|
-
|
10776
|
-
|
10777
|
-
|
10778
|
-
|
10779
|
-
|
10780
|
-
|
10781
|
-
|
10782
|
-
|
10783
|
-
|
10784
|
-
|
10785
|
-
|
10786
|
-
|
10787
|
-
|
10788
|
-
|
10789
|
-
|
10790
|
-
|
10791
|
-
|
10792
|
-
|
10793
|
-
|
10794
|
-
|
10795
|
-
|
10796
|
-
|
10797
|
-
|
10798
|
-
|
10799
|
-
|
10800
|
-
|
10801
|
-
|
10802
|
-
|
10018
|
+
static const uint16_t kgrid_1bit_2048[NGRID_IQ1S] = {
|
10019
|
+
0, 2, 5, 8, 10, 17, 21, 32, 34, 40, 42, 69, 81, 84, 86, 101,
|
10020
|
+
128, 130, 136, 138, 149, 160, 162, 168, 170, 260, 261, 273, 276, 278, 281, 282,
|
10021
|
+
293, 321, 326, 329, 338, 341, 346, 353, 356, 358, 360, 389, 401, 404, 406, 421,
|
10022
|
+
512, 514, 520, 522, 533, 544, 546, 552, 554, 581, 593, 601, 612, 617, 640, 642,
|
10023
|
+
648, 650, 657, 661, 665, 672, 674, 680, 682, 1041, 1044, 1046, 1061, 1089, 1097, 1109,
|
10024
|
+
1114, 1124, 1125, 1169, 1177, 1189, 1281, 1284, 1285, 1286, 1301, 1304, 1306, 1321, 1344, 1349,
|
10025
|
+
1354, 1360, 1361, 1364, 1365, 1366, 1369, 1376, 1378, 1381, 1384, 1386, 1409, 1425, 1429, 1432,
|
10026
|
+
1434, 1441, 1444, 1445, 1446, 1449, 1556, 1561, 1601, 1604, 1616, 1618, 1621, 1624, 1632, 1633,
|
10027
|
+
1638, 1641, 1669, 1681, 1684, 1689, 2048, 2050, 2056, 2058, 2069, 2080, 2082, 2088, 2090, 2117,
|
10028
|
+
2129, 2134, 2149, 2176, 2178, 2184, 2186, 2197, 2208, 2210, 2216, 2218, 2309, 2321, 2324, 2329,
|
10029
|
+
2340, 2341, 2369, 2384, 2385, 2389, 2401, 2404, 2409, 2449, 2452, 2454, 2457, 2469, 2560, 2562,
|
10030
|
+
2568, 2570, 2581, 2592, 2594, 2600, 2602, 2629, 2641, 2649, 2657, 2661, 2688, 2690, 2693, 2696,
|
10031
|
+
2698, 2709, 2720, 2722, 2728, 2730, 4112, 4113, 4116, 4121, 4132, 4133, 4161, 4164, 4176, 4181,
|
10032
|
+
4184, 4193, 4196, 4197, 4201, 4241, 4244, 4246, 4257, 4261, 4353, 4356, 4358, 4361, 4368, 4370,
|
10033
|
+
4373, 4376, 4385, 4388, 4393, 4421, 4426, 4432, 4433, 4434, 4436, 4437, 4438, 4441, 4448, 4453,
|
10034
|
+
4484, 4498, 4501, 4513, 4516, 4625, 4628, 4630, 4645, 4672, 4678, 4681, 4690, 4693, 4696, 4698,
|
10035
|
+
4708, 4710, 4741, 4753, 4756, 4758, 4773, 5121, 5126, 5129, 5140, 5141, 5144, 5145, 5153, 5158,
|
10036
|
+
5185, 5189, 5190, 5192, 5194, 5201, 5204, 5205, 5206, 5209, 5218, 5221, 5224, 5252, 5257, 5264,
|
10037
|
+
5268, 5269, 5272, 5273, 5274, 5281, 5284, 5285, 5289, 5378, 5381, 5386, 5393, 5396, 5397, 5398,
|
10038
|
+
5401, 5408, 5410, 5413, 5416, 5418, 5441, 5444, 5445, 5446, 5457, 5458, 5460, 5461, 5462, 5465,
|
10039
|
+
5466, 5473, 5476, 5477, 5478, 5481, 5504, 5506, 5508, 5509, 5512, 5514, 5520, 5521, 5524, 5525,
|
10040
|
+
5526, 5529, 5530, 5536, 5538, 5541, 5633, 5636, 5637, 5638, 5653, 5654, 5656, 5658, 5665, 5670,
|
10041
|
+
5696, 5698, 5700, 5701, 5704, 5706, 5713, 5717, 5718, 5720, 5721, 5729, 5732, 5733, 5736, 5737,
|
10042
|
+
5738, 5766, 5770, 5778, 5781, 5796, 5801, 6161, 6166, 6181, 6209, 6212, 6214, 6217, 6224, 6229,
|
10043
|
+
6232, 6234, 6240, 6241, 6244, 6246, 6249, 6277, 6289, 6292, 6309, 6416, 6418, 6421, 6426, 6433,
|
10044
|
+
6437, 6466, 6468, 6469, 6472, 6481, 6484, 6485, 6486, 6489, 6490, 6496, 6501, 6506, 6537, 6545,
|
10045
|
+
6546, 6549, 6552, 6561, 6566, 6569, 6665, 6678, 6692, 6694, 6724, 6726, 6729, 6736, 6738, 6741,
|
10046
|
+
6744, 6753, 6758, 6761, 6789, 6801, 6806, 6810, 8192, 8194, 8200, 8202, 8213, 8224, 8226, 8229,
|
10047
|
+
8232, 8234, 8261, 8273, 8281, 8289, 8293, 8320, 8322, 8328, 8330, 8341, 8352, 8354, 8357, 8360,
|
10048
|
+
8362, 8453, 8465, 8468, 8473, 8485, 8514, 8516, 8521, 8533, 8536, 8538, 8545, 8548, 8549, 8550,
|
10049
|
+
8581, 8592, 8598, 8601, 8613, 8705, 8712, 8714, 8721, 8725, 8736, 8738, 8744, 8746, 8773, 8785,
|
10050
|
+
8790, 8793, 8805, 8833, 8840, 8842, 8849, 8853, 8864, 8866, 8872, 8874, 9221, 9236, 9238, 9241,
|
10051
|
+
9253, 9284, 9285, 9286, 9289, 9298, 9301, 9304, 9306, 9318, 9349, 9361, 9364, 9369, 9377, 9381,
|
10052
|
+
9481, 9493, 9505, 9513, 9536, 9541, 9544, 9553, 9556, 9557, 9561, 9570, 9573, 9576, 9609, 9616,
|
10053
|
+
9620, 9621, 9624, 9626, 9633, 9636, 9638, 9641, 9733, 9744, 9746, 9753, 9765, 9793, 9801, 9813,
|
10054
|
+
9824, 9825, 9833, 9860, 9862, 9872, 9882, 10240, 10242, 10248, 10250, 10261, 10272, 10274, 10280, 10282,
|
10055
|
+
10309, 10321, 10324, 10341, 10368, 10370, 10376, 10378, 10400, 10402, 10408, 10410, 10505, 10513, 10516, 10521,
|
10056
|
+
10533, 10566, 10569, 10578, 10581, 10593, 10596, 10598, 10601, 10629, 10640, 10646, 10649, 10660, 10661, 10752,
|
10057
|
+
10754, 10760, 10762, 10784, 10786, 10792, 10794, 10821, 10833, 10838, 10841, 10853, 10880, 10882, 10888, 10890,
|
10058
|
+
10901, 10912, 10914, 10920, 10922, 16389, 16401, 16406, 16421, 16457, 16466, 16469, 16472, 16474, 16481, 16484,
|
10059
|
+
16486, 16532, 16537, 16545, 16550, 16640, 16641, 16644, 16646, 16649, 16658, 16661, 16662, 16664, 16666, 16673,
|
10060
|
+
16678, 16681, 16709, 16712, 16714, 16721, 16724, 16725, 16726, 16729, 16730, 16741, 16744, 16746, 16769, 16772,
|
10061
|
+
16774, 16784, 16786, 16789, 16800, 16801, 16802, 16901, 16913, 16916, 16918, 16933, 16961, 16978, 16981, 16986,
|
10062
|
+
16996, 17001, 17033, 17044, 17061, 17409, 17429, 17433, 17449, 17477, 17480, 17482, 17489, 17492, 17493, 17494,
|
10063
|
+
17505, 17506, 17509, 17512, 17514, 17537, 17542, 17545, 17552, 17554, 17557, 17568, 17569, 17577, 17665, 17666,
|
10064
|
+
17669, 17674, 17681, 17684, 17685, 17686, 17689, 17696, 17701, 17706, 17729, 17732, 17733, 17734, 17737, 17744,
|
10065
|
+
17745, 17748, 17749, 17750, 17752, 17753, 17761, 17764, 17765, 17766, 17769, 17794, 17796, 17797, 17800, 17809,
|
10066
|
+
17812, 17813, 17814, 17817, 17818, 17829, 17832, 17834, 17921, 17925, 17929, 17940, 17941, 17944, 17946, 17953,
|
10067
|
+
17956, 17961, 17984, 17986, 17989, 17992, 18000, 18001, 18002, 18005, 18006, 18009, 18018, 18021, 18024, 18049,
|
10068
|
+
18053, 18058, 18068, 18069, 18081, 18084, 18086, 18437, 18449, 18453, 18458, 18469, 18498, 18505, 18512, 18517,
|
10069
|
+
18520, 18529, 18532, 18534, 18537, 18565, 18577, 18580, 18582, 18585, 18597, 18689, 18693, 18694, 18698, 18704,
|
10070
|
+
18708, 18709, 18712, 18721, 18724, 18726, 18752, 18757, 18762, 18769, 18770, 18772, 18773, 18774, 18777, 18784,
|
10071
|
+
18786, 18789, 18790, 18794, 18822, 18825, 18834, 18837, 18838, 18840, 18849, 18852, 18854, 18857, 18966, 19012,
|
10072
|
+
19014, 19017, 19029, 19032, 19034, 19044, 19049, 19092, 19109, 20481, 20484, 20485, 20486, 20489, 20498, 20501,
|
10073
|
+
20506, 20513, 20516, 20521, 20544, 20549, 20552, 20561, 20564, 20565, 20566, 20569, 20581, 20584, 20614, 20617,
|
10074
|
+
20629, 20632, 20640, 20641, 20646, 20649, 20741, 20744, 20745, 20746, 20753, 20756, 20757, 20758, 20760, 20761,
|
10075
|
+
20768, 20773, 20774, 20776, 20778, 20801, 20804, 20805, 20806, 20809, 20816, 20817, 20818, 20820, 20821, 20822,
|
10076
|
+
20824, 20825, 20826, 20833, 20836, 20837, 20838, 20841, 20866, 20869, 20881, 20884, 20885, 20886, 20889, 20896,
|
10077
|
+
20901, 20906, 20993, 20998, 21010, 21013, 21018, 21025, 21028, 21058, 21061, 21066, 21073, 21076, 21077, 21078,
|
10078
|
+
21081, 21090, 21093, 21125, 21136, 21138, 21141, 21145, 21146, 21156, 21508, 21509, 21521, 21524, 21525, 21526,
|
10079
|
+
21528, 21529, 21537, 21541, 21544, 21546, 21569, 21572, 21573, 21574, 21577, 21578, 21584, 21585, 21588, 21589,
|
10080
|
+
21590, 21592, 21593, 21594, 21601, 21602, 21604, 21605, 21606, 21609, 21632, 21640, 21642, 21649, 21652, 21653,
|
10081
|
+
21654, 21657, 21665, 21668, 21669, 21674, 21761, 21762, 21764, 21765, 21766, 21769, 21776, 21777, 21778, 21780,
|
10082
|
+
21781, 21782, 21785, 21786, 21793, 21796, 21797, 21798, 21801, 21824, 21825, 21826, 21828, 21829, 21830, 21832,
|
10083
|
+
21833, 21840, 21841, 21842, 21844, 21845, 21846, 21848, 21849, 21850, 21856, 21857, 21860, 21861, 21862, 21864,
|
10084
|
+
21865, 21866, 21889, 21892, 21893, 21897, 21898, 21904, 21905, 21908, 21909, 21910, 21912, 21913, 21921, 21924,
|
10085
|
+
21925, 21926, 21929, 22016, 22017, 22018, 22020, 22022, 22024, 22025, 22033, 22036, 22037, 22040, 22041, 22048,
|
10086
|
+
22049, 22050, 22052, 22053, 22054, 22056, 22057, 22081, 22085, 22086, 22088, 22089, 22090, 22096, 22097, 22098,
|
10087
|
+
22100, 22101, 22102, 22104, 22105, 22106, 22113, 22116, 22117, 22121, 22146, 22149, 22150, 22152, 22153, 22154,
|
10088
|
+
22161, 22165, 22170, 22178, 22181, 22182, 22184, 22185, 22532, 22533, 22534, 22537, 22544, 22549, 22552, 22561,
|
10089
|
+
22570, 22597, 22600, 22602, 22609, 22612, 22613, 22614, 22616, 22617, 22624, 22626, 22628, 22629, 22658, 22665,
|
10090
|
+
22672, 22674, 22677, 22680, 22689, 22697, 22785, 22786, 22789, 22794, 22801, 22804, 22805, 22806, 22809, 22821,
|
10091
|
+
22849, 22852, 22853, 22854, 22857, 22864, 22865, 22866, 22868, 22869, 22870, 22872, 22873, 22874, 22881, 22884,
|
10092
|
+
22885, 22886, 22889, 22913, 22917, 22921, 22929, 22932, 22933, 22934, 22936, 22937, 22949, 23044, 23048, 23061,
|
10093
|
+
23066, 23072, 23077, 23078, 23081, 23109, 23112, 23113, 23121, 23125, 23126, 23128, 23129, 23138, 23141, 23144,
|
10094
|
+
23146, 23169, 23178, 23186, 23189, 23190, 23192, 23194, 23201, 24581, 24596, 24598, 24601, 24613, 24644, 24656,
|
10095
|
+
24661, 24662, 24664, 24666, 24673, 24676, 24678, 24681, 24705, 24726, 24741, 24833, 24836, 24838, 24841, 24850,
|
10096
|
+
24853, 24865, 24866, 24870, 24873, 24901, 24905, 24913, 24917, 24918, 24921, 24933, 24934, 24938, 24964, 24970,
|
10097
|
+
24978, 24981, 24993, 24998, 25001, 25105, 25110, 25113, 25152, 25153, 25158, 25173, 25174, 25176, 25184, 25221,
|
10098
|
+
25233, 25238, 25253, 25617, 25618, 25621, 25622, 25626, 25633, 25638, 25641, 25664, 25666, 25669, 25672, 25674,
|
10099
|
+
25681, 25684, 25685, 25686, 25689, 25690, 25696, 25698, 25701, 25732, 25733, 25737, 25744, 25746, 25748, 25749,
|
10100
|
+
25750, 25752, 25754, 25761, 25764, 25769, 25861, 25864, 25866, 25873, 25877, 25878, 25881, 25924, 25925, 25926,
|
10101
|
+
25929, 25936, 25937, 25940, 25941, 25942, 25945, 25953, 25956, 25957, 25958, 25961, 25990, 25993, 25994, 26001,
|
10102
|
+
26005, 26006, 26009, 26010, 26018, 26021, 26022, 26024, 26114, 26121, 26133, 26144, 26150, 26152, 26153, 26176,
|
10103
|
+
26181, 26184, 26186, 26193, 26196, 26197, 26198, 26200, 26202, 26208, 26213, 26216, 26240, 26242, 26245, 26250,
|
10104
|
+
26260, 26262, 26264, 26265, 26272, 26276, 26278, 26282, 26646, 26649, 26661, 26689, 26706, 26709, 26714, 26721,
|
10105
|
+
26729, 26757, 26769, 26776, 26790, 26881, 26884, 26896, 26901, 26913, 26916, 26918, 26921, 26944, 26945, 26949,
|
10106
|
+
26950, 26952, 26961, 26964, 26965, 26966, 26969, 26976, 26981, 26986, 27010, 27012, 27018, 27029, 27041, 27044,
|
10107
|
+
27045, 27049, 27153, 27158, 27160, 27201, 27204, 27209, 27216, 27221, 27224, 27226, 27236, 27237, 27241, 27270,
|
10108
|
+
27284, 27288, 27290, 27302, 32768, 32770, 32776, 32778, 32800, 32802, 32808, 32810, 32837, 32848, 32849, 32852,
|
10109
|
+
32854, 32857, 32869, 32896, 32898, 32904, 32906, 32917, 32928, 32930, 32936, 32938, 33029, 33041, 33044, 33046,
|
10110
|
+
33049, 33061, 33089, 33092, 33097, 33104, 33106, 33109, 33110, 33112, 33113, 33124, 33126, 33129, 33157, 33161,
|
10111
|
+
33172, 33174, 33177, 33189, 33280, 33282, 33288, 33290, 33301, 33312, 33314, 33320, 33322, 33361, 33364, 33369,
|
10112
|
+
33381, 33408, 33410, 33416, 33418, 33429, 33440, 33442, 33448, 33450, 33812, 33817, 33857, 33860, 33873, 33877,
|
10113
|
+
33882, 33889, 33892, 33897, 33940, 33945, 34049, 34057, 34066, 34069, 34074, 34086, 34089, 34112, 34113, 34117,
|
10114
|
+
34120, 34129, 34132, 34133, 34134, 34137, 34138, 34149, 34150, 34152, 34154, 34177, 34180, 34182, 34185, 34192,
|
10115
|
+
34194, 34197, 34200, 34214, 34321, 34326, 34329, 34341, 34369, 34372, 34377, 34378, 34384, 34389, 34393, 34394,
|
10116
|
+
34401, 34406, 34410, 34437, 34449, 34458, 34468, 34816, 34818, 34824, 34826, 34837, 34848, 34850, 34856, 34858,
|
10117
|
+
34881, 34885, 34897, 34900, 34905, 34917, 34921, 34944, 34946, 34952, 34954, 34965, 34976, 34978, 34984, 34986,
|
10118
|
+
35077, 35078, 35089, 35092, 35094, 35109, 35137, 35140, 35142, 35145, 35152, 35154, 35157, 35162, 35169, 35172,
|
10119
|
+
35205, 35222, 35225, 35237, 35328, 35330, 35336, 35338, 35349, 35360, 35362, 35368, 35370, 35397, 35409, 35412,
|
10120
|
+
35414, 35456, 35458, 35464, 35466, 35477, 35488, 35490, 35496, 35498, 36869, 36881, 36886, 36888, 36889, 36901,
|
10121
|
+
36929, 36934, 36937, 36949, 36952, 36954, 36969, 36970, 36997, 37009, 37012, 37014, 37017, 37029, 37121, 37124,
|
10122
|
+
37126, 37129, 37136, 37141, 37144, 37146, 37153, 37156, 37158, 37161, 37184, 37189, 37200, 37201, 37204, 37205,
|
10123
|
+
37206, 37209, 37218, 37221, 37252, 37254, 37266, 37269, 37272, 37281, 37284, 37286, 37289, 37381, 37393, 37396,
|
10124
|
+
37401, 37413, 37444, 37446, 37449, 37456, 37458, 37461, 37464, 37478, 37481, 37509, 37524, 37526, 37545, 37889,
|
10125
|
+
37892, 37894, 37904, 37909, 37912, 37926, 37952, 37962, 37969, 37972, 37973, 37974, 37976, 37977, 37984, 37985,
|
10126
|
+
37986, 37989, 38020, 38022, 38034, 38036, 38037, 38040, 38049, 38057, 38144, 38149, 38152, 38154, 38160, 38161,
|
10127
|
+
38164, 38165, 38166, 38169, 38177, 38181, 38185, 38186, 38209, 38212, 38213, 38214, 38217, 38224, 38225, 38226,
|
10128
|
+
38228, 38229, 38230, 38232, 38233, 38234, 38241, 38244, 38245, 38246, 38249, 38273, 38277, 38280, 38289, 38290,
|
10129
|
+
38292, 38293, 38294, 38297, 38298, 38304, 38306, 38309, 38312, 38314, 38401, 38404, 38416, 38421, 38425, 38432,
|
10130
|
+
38438, 38441, 38469, 38472, 38473, 38481, 38482, 38485, 38486, 38489, 38501, 38504, 38530, 38532, 38537, 38538,
|
10131
|
+
38546, 38548, 38549, 38564, 38566, 38569, 38917, 38934, 38937, 38949, 38977, 38982, 38992, 38994, 38997, 38998,
|
10132
|
+
39002, 39012, 39013, 39045, 39057, 39062, 39065, 39077, 39172, 39174, 39177, 39184, 39186, 39189, 39192, 39194,
|
10133
|
+
39200, 39201, 39204, 39206, 39232, 39234, 39237, 39240, 39242, 39249, 39252, 39253, 39254, 39257, 39266, 39269,
|
10134
|
+
39270, 39274, 39297, 39300, 39312, 39314, 39317, 39322, 39329, 39334, 39429, 39445, 39461, 39492, 39494, 39497,
|
10135
|
+
39504, 39509, 39512, 39521, 39557, 39569, 39572, 39573, 39574, 40960, 40962, 40968, 40970, 40981, 40992, 40994,
|
10136
|
+
41000, 41002, 41029, 41041, 41044, 41046, 41049, 41088, 41090, 41096, 41098, 41109, 41120, 41122, 41128, 41130,
|
10137
|
+
41221, 41225, 41233, 41236, 41238, 41241, 41242, 41286, 41289, 41297, 41301, 41304, 41306, 41313, 41316, 41349,
|
10138
|
+
41360, 41362, 41366, 41369, 41474, 41480, 41482, 41488, 41497, 41506, 41512, 41514, 41541, 41553, 41558, 41561,
|
10139
|
+
41573, 41600, 41602, 41608, 41610, 41621, 41632, 41634, 41640, 41642, 42009, 42021, 42049, 42052, 42064, 42068,
|
10140
|
+
42069, 42072, 42074, 42081, 42085, 42086, 42088, 42089, 42117, 42246, 42249, 42256, 42258, 42261, 42264, 42278,
|
10141
|
+
42281, 42306, 42309, 42321, 42324, 42325, 42326, 42329, 42341, 42346, 42369, 42372, 42373, 42374, 42377, 42386,
|
10142
|
+
42389, 42392, 42501, 42513, 42518, 42522, 42529, 42533, 42564, 42566, 42570, 42578, 42581, 42582, 42584, 42592,
|
10143
|
+
42594, 42630, 42640, 42645, 42646, 42649, 42657, 42660, 42662, 43008, 43010, 43016, 43018, 43040, 43042, 43048,
|
10144
|
+
43050, 43089, 43092, 43094, 43097, 43136, 43138, 43144, 43146, 43157, 43168, 43170, 43176, 43178, 43269, 43284,
|
10145
|
+
43289, 43297, 43301, 43329, 43344, 43349, 43354, 43361, 43366, 43369, 43408, 43414, 43520, 43522, 43528, 43530,
|
10146
|
+
43552, 43554, 43560, 43562, 43601, 43604, 43606, 43648, 43650, 43656, 43658, 43669, 43680, 43682, 43688, 43690,
|
10803
10147
|
};
|
10804
10148
|
static const uint16_t kgrid_2bit_1024[1024] = {
|
10805
10149
|
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
@@ -10873,12 +10217,12 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
10873
10217
|
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
|
10874
10218
|
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
|
10875
10219
|
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
10876
|
-
type == GGML_TYPE_IQ1_S ?
|
10220
|
+
type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024;
|
10877
10221
|
uint64_t * kgrid_q2xs;
|
10878
10222
|
int * kmap_q2xs;
|
10879
10223
|
uint16_t * kneighbors_q2xs;
|
10880
10224
|
|
10881
|
-
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
10225
|
+
//printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
10882
10226
|
uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
|
10883
10227
|
for (int k = 0; k < grid_size; ++k) {
|
10884
10228
|
int8_t * pos = (int8_t *)(the_grid + k);
|
@@ -10933,7 +10277,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
10933
10277
|
}
|
10934
10278
|
num_neighbors += n;
|
10935
10279
|
}
|
10936
|
-
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
10280
|
+
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
10937
10281
|
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
10938
10282
|
iq2_data[gindex].neighbours = kneighbors_q2xs;
|
10939
10283
|
int counter = 0;
|
@@ -11356,8 +10700,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
11356
10700
|
}
|
11357
10701
|
}
|
11358
10702
|
|
11359
|
-
size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row,
|
11360
|
-
(void)hist;
|
10703
|
+
size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11361
10704
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11362
10705
|
int nblock = n_per_row/QK_K;
|
11363
10706
|
char * qrow = (char *)dst;
|
@@ -11369,8 +10712,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row,
|
|
11369
10712
|
return nrow * nblock * sizeof(block_iq2_xxs);
|
11370
10713
|
}
|
11371
10714
|
|
11372
|
-
size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row,
|
11373
|
-
(void)hist;
|
10715
|
+
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11374
10716
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11375
10717
|
int nblock = n_per_row/QK_K;
|
11376
10718
|
char * qrow = (char *)dst;
|
@@ -11474,7 +10816,7 @@ void iq3xs_init_impl(int grid_size) {
|
|
11474
10816
|
int * kmap_q3xs;
|
11475
10817
|
uint16_t * kneighbors_q3xs;
|
11476
10818
|
|
11477
|
-
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
10819
|
+
//printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
11478
10820
|
uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
|
11479
10821
|
for (int k = 0; k < grid_size; ++k) {
|
11480
10822
|
int8_t * pos = (int8_t *)(the_grid + k);
|
@@ -11529,7 +10871,7 @@ void iq3xs_init_impl(int grid_size) {
|
|
11529
10871
|
}
|
11530
10872
|
num_neighbors += n;
|
11531
10873
|
}
|
11532
|
-
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
10874
|
+
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
11533
10875
|
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
11534
10876
|
iq3_data[gindex].neighbours = kneighbors_q3xs;
|
11535
10877
|
int counter = 0;
|
@@ -11812,8 +11154,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
11812
11154
|
}
|
11813
11155
|
}
|
11814
11156
|
|
11815
|
-
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row,
|
11816
|
-
(void)hist;
|
11157
|
+
size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11817
11158
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11818
11159
|
int nblock = n_per_row/QK_K;
|
11819
11160
|
char * qrow = (char *)dst;
|
@@ -11912,7 +11253,8 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
11912
11253
|
}
|
11913
11254
|
float best = 0;
|
11914
11255
|
float scale = max/(2*kMaxQ-1);
|
11915
|
-
for (int
|
11256
|
+
for (int k = 0; k < bs4; ++k) is_on_grid[k] = false;
|
11257
|
+
for (int is = -9; is <= 9; ++is) {
|
11916
11258
|
float id = (2*kMaxQ-1+is*0.2f)/max;
|
11917
11259
|
float this_scale = 1/id;
|
11918
11260
|
for (int k = 0; k < bs4; ++k) {
|
@@ -11948,7 +11290,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
11948
11290
|
if (n_not_ongrid > 0 && scale > 0) {
|
11949
11291
|
float id = 1/scale;
|
11950
11292
|
for (int k = 0; k < bs4; ++k) {
|
11951
|
-
if (is_on_grid[k]) continue;
|
11293
|
+
//if (is_on_grid[k]) continue;
|
11952
11294
|
uint16_t u = 0;
|
11953
11295
|
for (int i = 0; i < 4; ++i) {
|
11954
11296
|
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
@@ -12004,7 +11346,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
12004
11346
|
}
|
12005
11347
|
|
12006
11348
|
float d = max_scale/31;
|
12007
|
-
y[ibl].d = GGML_FP32_TO_FP16(d);
|
11349
|
+
y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f);
|
12008
11350
|
float id = 1/d;
|
12009
11351
|
for (int ib = 0; ib < QK_K/block_size; ib += 2) {
|
12010
11352
|
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
|
@@ -12018,8 +11360,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
12018
11360
|
}
|
12019
11361
|
|
12020
11362
|
#define IQ3S_BLOCK_SIZE 32
|
12021
|
-
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row,
|
12022
|
-
(void)hist;
|
11363
|
+
size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12023
11364
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
12024
11365
|
int nblock = n_per_row/QK_K;
|
12025
11366
|
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
@@ -12049,7 +11390,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
|
12049
11390
|
|
12050
11391
|
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
12051
11392
|
assert(k % QK_K == 0);
|
12052
|
-
quantize_iq3_s(x, y, 1, k, NULL
|
11393
|
+
quantize_iq3_s(x, y, 1, k, NULL);
|
12053
11394
|
}
|
12054
11395
|
|
12055
11396
|
|
@@ -12115,12 +11456,70 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
12115
11456
|
return grid_index;
|
12116
11457
|
}
|
12117
11458
|
|
11459
|
+
/*
 * Select the grid point that best approximates a group of 8 input values.
 *
 * neighbours : candidate list; neighbours[0] is the number of candidates and
 *              neighbours[1..neighbours[0]] are their indices into `grid`.
 * grid       : grid table; each 64-bit entry is read as 8 int8_t components.
 * xval       : the 8 values being quantized.
 * weight     : the 8 per-element weights of the squared error.
 * scale      : scale applied to a level before it is compared with xval.
 * xg         : level table, indexed by (component - 1)/2
 *              (presumably components are 1, 3 or 5, giving indices 0..2 -- TODO confirm).
 * L          : output; receives (component - 1)/2 for each of the 8 components
 *              of the selected grid point.
 * ngrid      : number of entries in `grid`, used only by the fallback search.
 *
 * The score of a grid point is sum_i weight[i] * (scale*xg[...] - xval[i])^2;
 * the point with the smallest score wins. Returns its index in `grid`.
 * Asserts that the candidate list is non-empty and that a point was found.
 */
static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);
    float best_score = FLT_MAX;
    int grid_index = -1;
    // First pass: only the pre-computed candidates.
    for (int j = 1; j <= num_neighbors; ++j) {
        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
        float d2 = 0;
        for (int i = 0; i < 8; ++i) {
            float q = xg[(pg[i] - 1)/2];
            float w = weight[i];
            float diff = scale*q - xval[i];
            d2 += w*diff*diff;
        }
        if (d2 < best_score) {
            best_score = d2;
            grid_index = neighbours[j];
        }
    }
    // Fallback: no candidate scored strictly below FLT_MAX, so scan the whole grid.
    if (grid_index < 0) {
        for (int i = 0; i < ngrid; ++i) {
            const int8_t * grid_i = (const int8_t *)(grid + i);
            float d2 = 0;
            for (int j = 0; j < 8; ++j) {
                float w = weight[j];
                float q = xg[(grid_i[j] - 1)/2];
                // j is the element index here; i is the grid point. Indexing xval
                // with i would read outside the 8-value group.
                float diff = scale*q - xval[j];
                d2 += w*diff*diff;
            }
            if (d2 < best_score) {
                best_score = d2;
                grid_index = i;
            }
        }
    }
    // Still nothing: dump per-candidate sums for diagnosis before the assert fires.
    if (grid_index < 0) {
        printf("Oops, did not find grid point\n");
        printf("Have %d neighbours\n", num_neighbors);
        for (int j = 1; j <= num_neighbors; ++j) {
            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
            float sumqx = 0, sumq2 = 0;
            for (int i = 0; i < 8; ++i) {
                float q = xg[(pg[i] - 1)/2];
                float w = weight[i];
                sumqx += w*q*xval[i];
                sumq2 += w*q*q;
            }
            printf("    neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
        }
    }
    GGML_ASSERT(grid_index >= 0);
    const int8_t * pg = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
    return grid_index;
}
|
11515
|
+
|
12118
11516
|
static int iq1_sort_helper(const void * left, const void * right) {
|
12119
11517
|
const float * l = left;
|
12120
11518
|
const float * r = right;
|
12121
11519
|
return *l < *r ? -1 : *l > *r ? 1 : 0;
|
12122
11520
|
}
|
12123
11521
|
|
11522
|
+
#define IQ1S_BLOCK_SIZE 32
|
12124
11523
|
static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
12125
11524
|
|
12126
11525
|
const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
|
@@ -12139,37 +11538,41 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
12139
11538
|
|
12140
11539
|
block_iq1_s * y = vy;
|
12141
11540
|
|
12142
|
-
float
|
12143
|
-
float
|
12144
|
-
|
12145
|
-
float
|
12146
|
-
float
|
12147
|
-
|
11541
|
+
const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
|
11542
|
+
const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
|
11543
|
+
|
11544
|
+
float scales[QK_K/IQ1S_BLOCK_SIZE];
|
11545
|
+
float weight[IQ1S_BLOCK_SIZE];
|
11546
|
+
int8_t L[IQ1S_BLOCK_SIZE];
|
11547
|
+
float sumx[IQ1S_BLOCK_SIZE+1];
|
11548
|
+
float sumw[IQ1S_BLOCK_SIZE+1];
|
11549
|
+
float pairs[2*IQ1S_BLOCK_SIZE];
|
12148
11550
|
int * idx = (int *)(pairs + 1);
|
12149
|
-
|
11551
|
+
uint16_t index[IQ1S_BLOCK_SIZE/8];
|
11552
|
+
int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
|
12150
11553
|
|
12151
11554
|
for (int ibl = 0; ibl < nbl; ++ibl) {
|
12152
11555
|
|
12153
11556
|
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
12154
11557
|
memset(y[ibl].qs, 0, QK_K/8);
|
12155
|
-
memset(y[ibl].
|
11558
|
+
memset(y[ibl].qh, 0, QK_K/16);
|
12156
11559
|
|
12157
11560
|
float max_scale = 0;
|
12158
11561
|
|
12159
11562
|
const float * xbl = x + QK_K*ibl;
|
12160
11563
|
float sumx2 = 0;
|
12161
11564
|
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
12162
|
-
float sigma2 = sumx2/QK_K;
|
11565
|
+
float sigma2 = 2*sumx2/QK_K;
|
12163
11566
|
|
12164
|
-
for (int ib = 0; ib < QK_K/
|
12165
|
-
const float * xb = xbl +
|
12166
|
-
const float * qw = quant_weights + QK_K*ibl +
|
12167
|
-
for (int i = 0; i <
|
11567
|
+
for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
|
11568
|
+
const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
|
11569
|
+
const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
|
11570
|
+
for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
12168
11571
|
float max = fabsf(xb[0]);
|
12169
|
-
for (int i = 1; i <
|
11572
|
+
for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
|
12170
11573
|
if (!max) {
|
12171
11574
|
scales[ib] = 0;
|
12172
|
-
memset(L, 1,
|
11575
|
+
memset(L, 1, IQ1S_BLOCK_SIZE);
|
12173
11576
|
continue;
|
12174
11577
|
}
|
12175
11578
|
// Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
|
@@ -12178,52 +11581,81 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
12178
11581
|
// in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
|
12179
11582
|
// Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
|
12180
11583
|
// for each possible and score for each split.
|
12181
|
-
for (int j = 0; j <
|
11584
|
+
for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
|
12182
11585
|
pairs[2*j] = xb[j];
|
12183
11586
|
idx[2*j] = j;
|
12184
11587
|
}
|
12185
|
-
qsort(pairs,
|
11588
|
+
qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
|
12186
11589
|
{
|
12187
11590
|
sumx[0] = sumw[0] = 0;
|
12188
|
-
for (int j = 0; j <
|
11591
|
+
for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
|
12189
11592
|
int i = idx[2*j];
|
12190
11593
|
sumx[j+1] = sumx[j] + weight[i]*xb[i];
|
12191
11594
|
sumw[j+1] = sumw[j] + weight[i];
|
12192
11595
|
}
|
12193
11596
|
}
|
12194
11597
|
float best_score = 0, scale = max;
|
12195
|
-
int besti1 =
|
12196
|
-
for (int i1 = 0; i1 <=
|
12197
|
-
for (int i2 = i1; i2 <=
|
12198
|
-
float sumqx =
|
12199
|
-
float sumq2 =
|
11598
|
+
int besti1 = -1, besti2 = -1, best_shift = 0;
|
11599
|
+
for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
|
11600
|
+
for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
|
11601
|
+
float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
|
11602
|
+
float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
|
12200
11603
|
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
12201
11604
|
scale = sumqx/sumq2; best_score = scale*sumqx;
|
12202
|
-
besti1 = i1; besti2 = i2;
|
11605
|
+
besti1 = i1; besti2 = i2; best_shift = 1;
|
11606
|
+
}
|
11607
|
+
sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
|
11608
|
+
sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
|
11609
|
+
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
11610
|
+
scale = sumqx/sumq2; best_score = scale*sumqx;
|
11611
|
+
besti1 = i1; besti2 = i2; best_shift = -1;
|
12203
11612
|
}
|
12204
11613
|
}
|
12205
11614
|
}
|
11615
|
+
GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
|
12206
11616
|
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
|
12207
11617
|
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
|
12208
|
-
for (int j = besti2; j <
|
11618
|
+
for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
|
12209
11619
|
if (scale < 0) {
|
12210
|
-
for (int j = 0; j <
|
12211
|
-
scale = -scale;
|
11620
|
+
for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
|
11621
|
+
scale = -scale; best_shift = -best_shift;
|
11622
|
+
}
|
11623
|
+
bool all_on_grid = true;
|
11624
|
+
const float * xx = best_shift == 1 ? x_p : x_m;
|
11625
|
+
for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
|
11626
|
+
uint16_t u = 0;
|
11627
|
+
for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
|
11628
|
+
int grid_index = kmap_q2xs[u];
|
11629
|
+
if (grid_index < 0) {
|
11630
|
+
all_on_grid = false;
|
11631
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
11632
|
+
grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
|
11633
|
+
GGML_ASSERT(grid_index >= 0);
|
11634
|
+
}
|
11635
|
+
index[k] = grid_index;
|
11636
|
+
}
|
11637
|
+
if (!all_on_grid) {
|
11638
|
+
float sumqx = 0, sumq2 = 0;
|
11639
|
+
for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
|
11640
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
|
11641
|
+
for (int j = 0; j < 8; ++j) {
|
11642
|
+
float w = weight[8*k + j];
|
11643
|
+
float q = xx[(pg[j] - 1)/2];
|
11644
|
+
sumqx += w*q*xb[8*k+j];
|
11645
|
+
sumq2 += w*q*q;
|
11646
|
+
}
|
11647
|
+
}
|
11648
|
+
if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
|
11649
|
+
}
|
11650
|
+
uint16_t h = 0;
|
11651
|
+
for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
|
11652
|
+
y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255;
|
11653
|
+
h |= (index[k] >> 8) << 3*k;
|
12212
11654
|
}
|
12213
|
-
|
12214
|
-
// grid point that minimizes SSD.
|
12215
|
-
uint16_t u = 0;
|
12216
|
-
for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
|
12217
|
-
int grid_index = kmap_q2xs[u];
|
12218
|
-
if (grid_index < 0) {
|
12219
|
-
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
12220
|
-
grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
|
12221
|
-
GGML_ASSERT(grid_index >= 0);
|
12222
|
-
}
|
12223
|
-
y[ibl].qs[ib] = grid_index & 255;
|
12224
|
-
hbit[ib] = grid_index >> 8;
|
11655
|
+
y[ibl].qh[ib] = h;
|
12225
11656
|
GGML_ASSERT(scale >= 0);
|
12226
11657
|
scales[ib] = scale;
|
11658
|
+
shifts[ib] = best_shift;
|
12227
11659
|
max_scale = MAX(max_scale, scale);
|
12228
11660
|
}
|
12229
11661
|
|
@@ -12233,19 +11665,18 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
12233
11665
|
}
|
12234
11666
|
|
12235
11667
|
float d = max_scale/15;
|
12236
|
-
y[ibl].d = GGML_FP32_TO_FP16(d*1.
|
11668
|
+
y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
|
12237
11669
|
float id = 1/d;
|
12238
|
-
for (int ib = 0; ib < QK_K/
|
11670
|
+
for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
|
12239
11671
|
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
12240
11672
|
l = MAX(0, MIN(7, l));
|
12241
|
-
if (
|
12242
|
-
y[ibl].
|
11673
|
+
if (shifts[ib] == -1) l |= 8;
|
11674
|
+
y[ibl].qh[ib] |= (l << 12);
|
12243
11675
|
}
|
12244
11676
|
}
|
12245
11677
|
}
|
12246
11678
|
|
12247
|
-
size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row,
|
12248
|
-
(void)hist;
|
11679
|
+
size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12249
11680
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
12250
11681
|
int nblock = n_per_row/QK_K;
|
12251
11682
|
char * qrow = (char *)dst;
|
@@ -12270,7 +11701,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|
12270
11701
|
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
12271
11702
|
}
|
12272
11703
|
|
12273
|
-
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float *
|
11704
|
+
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
|
12274
11705
|
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
12275
11706
|
float * scales, float * weight, uint8_t * L,
|
12276
11707
|
const int8_t * values,
|
@@ -12378,8 +11809,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|
12378
11809
|
}
|
12379
11810
|
}
|
12380
11811
|
|
12381
|
-
size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row,
|
12382
|
-
(void)hist;
|
11812
|
+
size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12383
11813
|
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
12384
11814
|
int nblock = n_per_row/QK4_NL;
|
12385
11815
|
char * qrow = (char *)dst;
|
@@ -12409,14 +11839,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
|
|
12409
11839
|
|
12410
11840
|
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
|
12411
11841
|
assert(k % QK4_NL == 0);
|
12412
|
-
quantize_iq4_nl(x, y, 1, k, NULL
|
11842
|
+
quantize_iq4_nl(x, y, 1, k, NULL);
|
12413
11843
|
}
|
12414
11844
|
|
12415
|
-
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row,
|
11845
|
+
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12416
11846
|
#if QK_K == 64
|
12417
|
-
return quantize_iq4_nl(src, dst, nrow, n_per_row,
|
11847
|
+
return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
|
12418
11848
|
#else
|
12419
|
-
(void)hist;
|
12420
11849
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
12421
11850
|
int nblock = n_per_row/QK_K;
|
12422
11851
|
char * qrow = (char *)dst;
|
@@ -12445,7 +11874,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
|
12445
11874
|
|
12446
11875
|
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
|
12447
11876
|
assert(k % QK_K == 0);
|
12448
|
-
quantize_iq4_xs(x, y, 1, k, NULL
|
11877
|
+
quantize_iq4_xs(x, y, 1, k, NULL);
|
12449
11878
|
}
|
12450
11879
|
|
12451
11880
|
// =============================== 2.5625 bpw
|
@@ -12618,8 +12047,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
12618
12047
|
}
|
12619
12048
|
}
|
12620
12049
|
|
12621
|
-
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row,
|
12622
|
-
(void)hist;
|
12050
|
+
size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12623
12051
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
12624
12052
|
int nblock = n_per_row/QK_K;
|
12625
12053
|
char * qrow = (char *)dst;
|
@@ -12633,7 +12061,7 @@ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, in
|
|
12633
12061
|
|
12634
12062
|
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
|
12635
12063
|
assert(k % QK_K == 0);
|
12636
|
-
quantize_iq2_s(x, y, 1, k, NULL
|
12064
|
+
quantize_iq2_s(x, y, 1, k, NULL);
|
12637
12065
|
}
|
12638
12066
|
|
12639
12067
|
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|