llama_cpp 0.13.0 → 0.14.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -1,6 +1,12 @@
|
|
1
|
+
#define GGML_COMMON_IMPL_C
|
2
|
+
#include "ggml-common.h"
|
3
|
+
|
1
4
|
#include "ggml-quants.h"
|
2
5
|
#include "ggml-impl.h"
|
3
6
|
|
7
|
+
#define GGML_COMMON_IMPL_C
|
8
|
+
#include "ggml-common.h"
|
9
|
+
|
4
10
|
#include <math.h>
|
5
11
|
#include <string.h>
|
6
12
|
#include <assert.h>
|
@@ -51,6 +57,7 @@
|
|
51
57
|
|
52
58
|
#define UNUSED GGML_UNUSED
|
53
59
|
|
60
|
+
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
54
61
|
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
55
62
|
|
56
63
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
@@ -463,8 +470,8 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|
463
470
|
}
|
464
471
|
|
465
472
|
// NOTE: not tested
|
466
|
-
inline static
|
467
|
-
|
473
|
+
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
474
|
+
uint8x16_t res;
|
468
475
|
|
469
476
|
res[ 0] = a[b[ 0]];
|
470
477
|
res[ 1] = a[b[ 1]];
|
@@ -947,7 +954,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
|
|
947
954
|
const float d = amax / ((1 << 7) - 1);
|
948
955
|
const float id = d ? 1.0f/d : 0.0f;
|
949
956
|
|
950
|
-
y[i].d = d;
|
957
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
951
958
|
|
952
959
|
int sum = 0;
|
953
960
|
|
@@ -962,7 +969,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
|
|
962
969
|
sum += y[i].qs[QK8_1/2 + j];
|
963
970
|
}
|
964
971
|
|
965
|
-
y[i].s = sum*d;
|
972
|
+
y[i].s = GGML_FP32_TO_FP16(sum*d);
|
966
973
|
}
|
967
974
|
}
|
968
975
|
|
@@ -990,7 +997,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
990
997
|
const float d = amax / ((1 << 7) - 1);
|
991
998
|
const float id = d ? 1.0f/d : 0.0f;
|
992
999
|
|
993
|
-
y[i].d = d;
|
1000
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
994
1001
|
|
995
1002
|
int32x4_t accv = vdupq_n_s32(0);
|
996
1003
|
|
@@ -1006,7 +1013,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1006
1013
|
accv = vaddq_s32(accv, vi);
|
1007
1014
|
}
|
1008
1015
|
|
1009
|
-
y[i].s = d * vaddvq_s32(accv);
|
1016
|
+
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
|
1010
1017
|
}
|
1011
1018
|
#elif defined(__wasm_simd128__)
|
1012
1019
|
for (int i = 0; i < nb; i++) {
|
@@ -1029,7 +1036,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1029
1036
|
const float d = amax / ((1 << 7) - 1);
|
1030
1037
|
const float id = d ? 1.0f/d : 0.0f;
|
1031
1038
|
|
1032
|
-
y[i].d = d;
|
1039
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1033
1040
|
|
1034
1041
|
v128_t accv = wasm_i32x4_splat(0);
|
1035
1042
|
|
@@ -1045,10 +1052,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1045
1052
|
accv = wasm_i32x4_add(accv, vi);
|
1046
1053
|
}
|
1047
1054
|
|
1048
|
-
y[i].s =
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1055
|
+
y[i].s = GGML_FP32_TO_FP16(
|
1056
|
+
d * (wasm_i32x4_extract_lane(accv, 0) +
|
1057
|
+
wasm_i32x4_extract_lane(accv, 1) +
|
1058
|
+
wasm_i32x4_extract_lane(accv, 2) +
|
1059
|
+
wasm_i32x4_extract_lane(accv, 3)));
|
1052
1060
|
}
|
1053
1061
|
#elif defined(__AVX2__) || defined(__AVX__)
|
1054
1062
|
for (int i = 0; i < nb; i++) {
|
@@ -1073,7 +1081,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1073
1081
|
|
1074
1082
|
// Quantize these floats
|
1075
1083
|
const float d = maxScalar / 127.f;
|
1076
|
-
y[i].d = d;
|
1084
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1077
1085
|
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
1078
1086
|
const __m256 mul = _mm256_set1_ps( id );
|
1079
1087
|
|
@@ -1097,7 +1105,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1097
1105
|
|
1098
1106
|
#if defined(__AVX2__)
|
1099
1107
|
// Compute the sum of the quants and set y[i].s
|
1100
|
-
y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
|
1108
|
+
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
|
1101
1109
|
|
1102
1110
|
// Convert int32 to int16
|
1103
1111
|
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
@@ -1127,7 +1135,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1127
1135
|
// Compute the sum of the quants and set y[i].s
|
1128
1136
|
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
|
1129
1137
|
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
|
1130
|
-
y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
|
1138
|
+
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
|
1131
1139
|
|
1132
1140
|
// Convert int32 to int16
|
1133
1141
|
ni0 = _mm_packs_epi32( ni0, ni1 );
|
@@ -1158,7 +1166,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1158
1166
|
const float d = amax / ((1 << 7) - 1);
|
1159
1167
|
const float id = d ? 1.0f/d : 0.0f;
|
1160
1168
|
|
1161
|
-
y[i].d = d;
|
1169
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1162
1170
|
|
1163
1171
|
vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
|
1164
1172
|
|
@@ -1175,7 +1183,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1175
1183
|
|
1176
1184
|
// set y[i].s
|
1177
1185
|
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
|
1178
|
-
y[i].s = sum*d;
|
1186
|
+
y[i].s = GGML_FP32_TO_FP16(sum*d);
|
1179
1187
|
}
|
1180
1188
|
#else
|
1181
1189
|
GGML_UNUSED(nb);
|
@@ -1700,16 +1708,6 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
|
1700
1708
|
quantize_row_q2_K_reference(x, vy, k);
|
1701
1709
|
}
|
1702
1710
|
|
1703
|
-
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
1704
|
-
(void)hist; // TODO: collect histograms
|
1705
|
-
|
1706
|
-
for (int j = 0; j < n; j += k) {
|
1707
|
-
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
1708
|
-
quantize_row_q2_K_reference(src + j, y, k);
|
1709
|
-
}
|
1710
|
-
return (n/QK_K*sizeof(block_q2_K));
|
1711
|
-
}
|
1712
|
-
|
1713
1711
|
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
1714
1712
|
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
1715
1713
|
float rmin, float rdelta, int nstep, bool use_mad) {
|
@@ -1962,8 +1960,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
1962
1960
|
}
|
1963
1961
|
}
|
1964
1962
|
|
1965
|
-
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row,
|
1966
|
-
(void)hist;
|
1963
|
+
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
1967
1964
|
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
1968
1965
|
if (!quant_weights) {
|
1969
1966
|
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
@@ -2182,16 +2179,6 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
|
2182
2179
|
quantize_row_q3_K_reference(x, vy, k);
|
2183
2180
|
}
|
2184
2181
|
|
2185
|
-
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
2186
|
-
(void)hist; // TODO: collect histograms
|
2187
|
-
|
2188
|
-
for (int j = 0; j < n; j += k) {
|
2189
|
-
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
2190
|
-
quantize_row_q3_K_reference(src + j, y, k);
|
2191
|
-
}
|
2192
|
-
return (n/QK_K*sizeof(block_q3_K));
|
2193
|
-
}
|
2194
|
-
|
2195
2182
|
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
|
2196
2183
|
#if QK_K != 256
|
2197
2184
|
(void)quant_weights;
|
@@ -2281,8 +2268,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|
2281
2268
|
#endif
|
2282
2269
|
}
|
2283
2270
|
|
2284
|
-
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row,
|
2285
|
-
(void)hist;
|
2271
|
+
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
2286
2272
|
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
2287
2273
|
if (!quant_weights) {
|
2288
2274
|
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
@@ -2452,17 +2438,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
|
2452
2438
|
quantize_row_q4_K_reference(x, y, k);
|
2453
2439
|
}
|
2454
2440
|
|
2455
|
-
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
2456
|
-
assert(k % QK_K == 0);
|
2457
|
-
(void)hist; // TODO: collect histograms
|
2458
|
-
|
2459
|
-
for (int j = 0; j < n; j += k) {
|
2460
|
-
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
2461
|
-
quantize_row_q4_K_reference(src + j, y, k);
|
2462
|
-
}
|
2463
|
-
return (n/QK_K*sizeof(block_q4_K));
|
2464
|
-
}
|
2465
|
-
|
2466
2441
|
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
|
2467
2442
|
#if QK_K != 256
|
2468
2443
|
(void)quant_weights;
|
@@ -2541,8 +2516,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
2541
2516
|
#endif
|
2542
2517
|
}
|
2543
2518
|
|
2544
|
-
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row,
|
2545
|
-
(void)hist;
|
2519
|
+
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
2546
2520
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
2547
2521
|
if (!quant_weights) {
|
2548
2522
|
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
@@ -2753,17 +2727,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
|
2753
2727
|
quantize_row_q5_K_reference(x, y, k);
|
2754
2728
|
}
|
2755
2729
|
|
2756
|
-
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
2757
|
-
assert(k % QK_K == 0);
|
2758
|
-
(void)hist; // TODO: collect histograms
|
2759
|
-
|
2760
|
-
for (int j = 0; j < n; j += k) {
|
2761
|
-
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
2762
|
-
quantize_row_q5_K_reference(src + j, y, k);
|
2763
|
-
}
|
2764
|
-
return (n/QK_K*sizeof(block_q5_K));
|
2765
|
-
}
|
2766
|
-
|
2767
2730
|
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
|
2768
2731
|
#if QK_K != 256
|
2769
2732
|
(void)quant_weights;
|
@@ -2862,8 +2825,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
2862
2825
|
#endif
|
2863
2826
|
}
|
2864
2827
|
|
2865
|
-
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row,
|
2866
|
-
(void)hist;
|
2828
|
+
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
2867
2829
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
2868
2830
|
if (!quant_weights) {
|
2869
2831
|
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
@@ -3016,17 +2978,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
|
3016
2978
|
quantize_row_q6_K_reference(x, y, k);
|
3017
2979
|
}
|
3018
2980
|
|
3019
|
-
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
3020
|
-
assert(k % QK_K == 0);
|
3021
|
-
(void)hist; // TODO: collect histograms
|
3022
|
-
|
3023
|
-
for (int j = 0; j < n; j += k) {
|
3024
|
-
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
3025
|
-
quantize_row_q6_K_reference(src + j, y, k);
|
3026
|
-
}
|
3027
|
-
return (n/QK_K*sizeof(block_q6_K));
|
3028
|
-
}
|
3029
|
-
|
3030
2981
|
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
|
3031
2982
|
#if QK_K != 256
|
3032
2983
|
(void)quant_weights;
|
@@ -3116,8 +3067,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
3116
3067
|
#endif
|
3117
3068
|
}
|
3118
3069
|
|
3119
|
-
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row,
|
3120
|
-
(void)hist;
|
3070
|
+
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3121
3071
|
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
3122
3072
|
if (!quant_weights) {
|
3123
3073
|
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
@@ -3161,9 +3111,10 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
3161
3111
|
}
|
3162
3112
|
}
|
3163
3113
|
|
3164
|
-
size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row,
|
3114
|
+
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3165
3115
|
if (!quant_weights) {
|
3166
|
-
|
3116
|
+
quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
|
3117
|
+
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
3167
3118
|
}
|
3168
3119
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
3169
3120
|
char * qrow = (char *)dst;
|
@@ -3205,9 +3156,10 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
3205
3156
|
}
|
3206
3157
|
}
|
3207
3158
|
|
3208
|
-
size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row,
|
3159
|
+
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3209
3160
|
if (!quant_weights) {
|
3210
|
-
|
3161
|
+
quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
|
3162
|
+
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
3211
3163
|
}
|
3212
3164
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
3213
3165
|
char * qrow = (char *)dst;
|
@@ -3258,9 +3210,10 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
3258
3210
|
}
|
3259
3211
|
}
|
3260
3212
|
|
3261
|
-
size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row,
|
3213
|
+
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3262
3214
|
if (!quant_weights) {
|
3263
|
-
|
3215
|
+
quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
|
3216
|
+
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
3264
3217
|
}
|
3265
3218
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
3266
3219
|
char * qrow = (char *)dst;
|
@@ -3310,9 +3263,10 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
3310
3263
|
}
|
3311
3264
|
}
|
3312
3265
|
|
3313
|
-
size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row,
|
3266
|
+
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3314
3267
|
if (!quant_weights) {
|
3315
|
-
|
3268
|
+
quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
|
3269
|
+
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
3316
3270
|
}
|
3317
3271
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
3318
3272
|
char * qrow = (char *)dst;
|
@@ -3324,712 +3278,14 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
|
|
3324
3278
|
return nrow * row_size;
|
3325
3279
|
}
|
3326
3280
|
|
3327
|
-
|
3328
|
-
|
3329
|
-
|
3330
|
-
|
3331
|
-
|
3332
|
-
|
3333
|
-
0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
|
3334
|
-
0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
|
3335
|
-
0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
|
3336
|
-
0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
|
3337
|
-
0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
|
3338
|
-
0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
|
3339
|
-
0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
|
3340
|
-
0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
|
3341
|
-
0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
|
3342
|
-
0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
|
3343
|
-
0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
|
3344
|
-
0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
|
3345
|
-
0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
|
3346
|
-
0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
|
3347
|
-
0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
|
3348
|
-
0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
|
3349
|
-
0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
|
3350
|
-
0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
|
3351
|
-
0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
|
3352
|
-
0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
|
3353
|
-
0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
|
3354
|
-
0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
|
3355
|
-
0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
|
3356
|
-
0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
|
3357
|
-
0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
|
3358
|
-
0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
|
3359
|
-
0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
|
3360
|
-
0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
|
3361
|
-
0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
|
3362
|
-
0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
|
3363
|
-
0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
|
3364
|
-
0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
|
3365
|
-
0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
|
3366
|
-
0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
|
3367
|
-
0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
|
3368
|
-
0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
|
3369
|
-
0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
|
3370
|
-
0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
|
3371
|
-
0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
|
3372
|
-
0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
|
3373
|
-
0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
|
3374
|
-
0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
|
3375
|
-
0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
|
3376
|
-
0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
|
3377
|
-
0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
|
3378
|
-
0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
|
3379
|
-
0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
|
3380
|
-
0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
|
3381
|
-
0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
|
3382
|
-
0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
|
3383
|
-
0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
|
3384
|
-
0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
|
3385
|
-
0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
|
3386
|
-
0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
|
3387
|
-
0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
|
3388
|
-
0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
|
3389
|
-
0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
|
3390
|
-
0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
|
3391
|
-
0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
|
3392
|
-
0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
|
3393
|
-
0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
|
3394
|
-
};
|
3395
|
-
|
3396
|
-
static const uint64_t iq2xs_grid[512] = {
|
3397
|
-
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
3398
|
-
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
3399
|
-
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
3400
|
-
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
3401
|
-
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
3402
|
-
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
|
3403
|
-
0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
|
3404
|
-
0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
|
3405
|
-
0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
|
3406
|
-
0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
|
3407
|
-
0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
|
3408
|
-
0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
|
3409
|
-
0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
|
3410
|
-
0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
|
3411
|
-
0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
|
3412
|
-
0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
|
3413
|
-
0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
|
3414
|
-
0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
|
3415
|
-
0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
|
3416
|
-
0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
|
3417
|
-
0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
|
3418
|
-
0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
|
3419
|
-
0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
|
3420
|
-
0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
|
3421
|
-
0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
|
3422
|
-
0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
|
3423
|
-
0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
|
3424
|
-
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
|
3425
|
-
0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
|
3426
|
-
0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
|
3427
|
-
0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
|
3428
|
-
0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
|
3429
|
-
0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
|
3430
|
-
0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
|
3431
|
-
0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
|
3432
|
-
0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
|
3433
|
-
0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
|
3434
|
-
0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
|
3435
|
-
0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
|
3436
|
-
0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
|
3437
|
-
0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
|
3438
|
-
0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
|
3439
|
-
0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
|
3440
|
-
0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
|
3441
|
-
0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
|
3442
|
-
0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
|
3443
|
-
0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
|
3444
|
-
0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
|
3445
|
-
0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
|
3446
|
-
0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
|
3447
|
-
0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
|
3448
|
-
0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
|
3449
|
-
0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
|
3450
|
-
0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
|
3451
|
-
0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
|
3452
|
-
0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
|
3453
|
-
0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
|
3454
|
-
0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
|
3455
|
-
0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
|
3456
|
-
0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
|
3457
|
-
0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
|
3458
|
-
0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
|
3459
|
-
0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
|
3460
|
-
0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
|
3461
|
-
0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
|
3462
|
-
0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
|
3463
|
-
0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
|
3464
|
-
0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
|
3465
|
-
0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
|
3466
|
-
0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
|
3467
|
-
0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
|
3468
|
-
0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
|
3469
|
-
0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
|
3470
|
-
0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
|
3471
|
-
0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
|
3472
|
-
0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
|
3473
|
-
0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
|
3474
|
-
0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
|
3475
|
-
0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
|
3476
|
-
0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
|
3477
|
-
0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
|
3478
|
-
0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
|
3479
|
-
0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
|
3480
|
-
0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
|
3481
|
-
0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
|
3482
|
-
0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
|
3483
|
-
0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
|
3484
|
-
0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
|
3485
|
-
0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
|
3486
|
-
0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
|
3487
|
-
0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
|
3488
|
-
0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
|
3489
|
-
0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
|
3490
|
-
0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
|
3491
|
-
0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
|
3492
|
-
0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
|
3493
|
-
0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
|
3494
|
-
0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
|
3495
|
-
0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
|
3496
|
-
0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
|
3497
|
-
0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
|
3498
|
-
0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
|
3499
|
-
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
|
3500
|
-
0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
|
3501
|
-
0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
|
3502
|
-
0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
|
3503
|
-
0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
|
3504
|
-
0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
|
3505
|
-
0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
|
3506
|
-
0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
|
3507
|
-
0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
|
3508
|
-
0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
|
3509
|
-
0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
|
3510
|
-
0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
|
3511
|
-
0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
|
3512
|
-
0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
|
3513
|
-
0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
|
3514
|
-
0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
|
3515
|
-
0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
|
3516
|
-
0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
|
3517
|
-
0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
|
3518
|
-
0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
|
3519
|
-
0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
|
3520
|
-
0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
|
3521
|
-
0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
|
3522
|
-
0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
|
3523
|
-
0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
|
3524
|
-
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
3525
|
-
};
|
3526
|
-
|
3527
|
-
static const uint64_t iq2s_grid[1024] = {
|
3528
|
-
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
3529
|
-
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
3530
|
-
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
3531
|
-
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
3532
|
-
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
3533
|
-
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
3534
|
-
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
3535
|
-
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
3536
|
-
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
3537
|
-
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
3538
|
-
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
3539
|
-
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
3540
|
-
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
3541
|
-
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
3542
|
-
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
3543
|
-
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
3544
|
-
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
3545
|
-
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
3546
|
-
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
3547
|
-
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
3548
|
-
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
3549
|
-
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
3550
|
-
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
3551
|
-
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
3552
|
-
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
3553
|
-
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
3554
|
-
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
3555
|
-
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
3556
|
-
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
3557
|
-
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
3558
|
-
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
3559
|
-
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
3560
|
-
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
3561
|
-
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
3562
|
-
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
3563
|
-
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
3564
|
-
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
3565
|
-
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
3566
|
-
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
3567
|
-
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
3568
|
-
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
3569
|
-
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
3570
|
-
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
3571
|
-
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
3572
|
-
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
3573
|
-
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
3574
|
-
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
3575
|
-
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
3576
|
-
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
3577
|
-
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
3578
|
-
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
3579
|
-
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
3580
|
-
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
3581
|
-
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
3582
|
-
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
3583
|
-
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
3584
|
-
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
3585
|
-
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
3586
|
-
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
3587
|
-
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
3588
|
-
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
3589
|
-
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
3590
|
-
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
3591
|
-
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
3592
|
-
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
3593
|
-
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
3594
|
-
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
3595
|
-
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
3596
|
-
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
3597
|
-
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
3598
|
-
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
3599
|
-
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
3600
|
-
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
3601
|
-
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
3602
|
-
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
3603
|
-
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
3604
|
-
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
3605
|
-
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
3606
|
-
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
3607
|
-
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
3608
|
-
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
3609
|
-
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
3610
|
-
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
3611
|
-
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
3612
|
-
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
3613
|
-
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
3614
|
-
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
3615
|
-
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
3616
|
-
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
3617
|
-
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
3618
|
-
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
3619
|
-
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
3620
|
-
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
3621
|
-
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
3622
|
-
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
3623
|
-
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
3624
|
-
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
3625
|
-
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
3626
|
-
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
3627
|
-
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
3628
|
-
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
3629
|
-
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
3630
|
-
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
3631
|
-
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
3632
|
-
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
3633
|
-
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
3634
|
-
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
3635
|
-
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
3636
|
-
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
3637
|
-
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
3638
|
-
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
3639
|
-
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
3640
|
-
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
3641
|
-
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
3642
|
-
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
3643
|
-
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
3644
|
-
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
3645
|
-
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
3646
|
-
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
3647
|
-
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
3648
|
-
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
3649
|
-
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
3650
|
-
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
3651
|
-
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
3652
|
-
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
3653
|
-
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
3654
|
-
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
3655
|
-
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
3656
|
-
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
3657
|
-
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
3658
|
-
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
3659
|
-
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
3660
|
-
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
3661
|
-
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
3662
|
-
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
3663
|
-
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
3664
|
-
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
3665
|
-
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
3666
|
-
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
3667
|
-
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
3668
|
-
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
3669
|
-
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
3670
|
-
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
3671
|
-
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
3672
|
-
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
3673
|
-
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
3674
|
-
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
3675
|
-
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
3676
|
-
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
3677
|
-
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
3678
|
-
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
3679
|
-
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
3680
|
-
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
3681
|
-
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
3682
|
-
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
3683
|
-
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
3684
|
-
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
3685
|
-
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
3686
|
-
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
3687
|
-
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
3688
|
-
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
3689
|
-
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
3690
|
-
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
3691
|
-
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
3692
|
-
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
3693
|
-
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
3694
|
-
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
3695
|
-
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
3696
|
-
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
3697
|
-
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
3698
|
-
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
3699
|
-
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
3700
|
-
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
3701
|
-
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
3702
|
-
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
3703
|
-
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
3704
|
-
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
3705
|
-
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
3706
|
-
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
3707
|
-
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
3708
|
-
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
3709
|
-
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
3710
|
-
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
3711
|
-
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
3712
|
-
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
3713
|
-
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
3714
|
-
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
3715
|
-
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
3716
|
-
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
3717
|
-
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
3718
|
-
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
3719
|
-
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
3720
|
-
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
3721
|
-
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
3722
|
-
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
3723
|
-
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
3724
|
-
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
3725
|
-
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
3726
|
-
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
3727
|
-
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
3728
|
-
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
3729
|
-
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
3730
|
-
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
3731
|
-
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
3732
|
-
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
3733
|
-
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
3734
|
-
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
3735
|
-
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
3736
|
-
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
3737
|
-
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
3738
|
-
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
3739
|
-
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
3740
|
-
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
3741
|
-
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
3742
|
-
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
3743
|
-
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
3744
|
-
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
3745
|
-
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
3746
|
-
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
3747
|
-
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
3748
|
-
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
3749
|
-
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
3750
|
-
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
3751
|
-
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
3752
|
-
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
3753
|
-
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
3754
|
-
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
3755
|
-
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
3756
|
-
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
3757
|
-
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
3758
|
-
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
3759
|
-
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
3760
|
-
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
3761
|
-
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
3762
|
-
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
3763
|
-
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
3764
|
-
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
3765
|
-
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
3766
|
-
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
3767
|
-
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
3768
|
-
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
3769
|
-
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
3770
|
-
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
3771
|
-
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
3772
|
-
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
3773
|
-
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
3774
|
-
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
3775
|
-
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
3776
|
-
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
3777
|
-
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
3778
|
-
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
3779
|
-
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
3780
|
-
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
3781
|
-
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
3782
|
-
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
3783
|
-
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
3784
|
-
};
|
3785
|
-
|
3786
|
-
static const uint32_t iq3xxs_grid[256] = {
|
3787
|
-
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
3788
|
-
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
3789
|
-
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
3790
|
-
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
3791
|
-
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
3792
|
-
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
3793
|
-
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
3794
|
-
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
3795
|
-
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
3796
|
-
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
3797
|
-
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
3798
|
-
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
3799
|
-
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
3800
|
-
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
3801
|
-
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
3802
|
-
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
3803
|
-
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
3804
|
-
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
3805
|
-
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
3806
|
-
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
3807
|
-
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
3808
|
-
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
3809
|
-
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
3810
|
-
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
3811
|
-
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
3812
|
-
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
3813
|
-
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
3814
|
-
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
3815
|
-
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
3816
|
-
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
3817
|
-
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
3818
|
-
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
3819
|
-
};
|
3820
|
-
|
3821
|
-
static const uint32_t iq3xs_grid[512] = {
|
3822
|
-
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
3823
|
-
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
3824
|
-
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
3825
|
-
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
3826
|
-
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
3827
|
-
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
3828
|
-
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
3829
|
-
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
3830
|
-
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
3831
|
-
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
3832
|
-
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
3833
|
-
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
3834
|
-
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
3835
|
-
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
3836
|
-
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
3837
|
-
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
3838
|
-
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
3839
|
-
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
3840
|
-
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
3841
|
-
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
3842
|
-
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
3843
|
-
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
3844
|
-
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
3845
|
-
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
3846
|
-
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
3847
|
-
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
3848
|
-
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
3849
|
-
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
3850
|
-
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
3851
|
-
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
3852
|
-
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
3853
|
-
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
3854
|
-
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
3855
|
-
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
3856
|
-
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
3857
|
-
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
3858
|
-
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
3859
|
-
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
3860
|
-
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
3861
|
-
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
3862
|
-
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
3863
|
-
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
3864
|
-
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
3865
|
-
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
3866
|
-
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
3867
|
-
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
3868
|
-
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
3869
|
-
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
3870
|
-
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
3871
|
-
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
3872
|
-
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
3873
|
-
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
3874
|
-
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
3875
|
-
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
3876
|
-
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
3877
|
-
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
3878
|
-
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
3879
|
-
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
3880
|
-
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
3881
|
-
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
3882
|
-
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
3883
|
-
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
3884
|
-
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
3885
|
-
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
3886
|
-
};
|
3887
|
-
|
3888
|
-
#define NGRID_IQ2XXS 512
|
3889
|
-
static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
|
3890
|
-
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
3891
|
-
0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
|
3892
|
-
0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
|
3893
|
-
0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
|
3894
|
-
0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
|
3895
|
-
0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
|
3896
|
-
0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
|
3897
|
-
0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
|
3898
|
-
0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
|
3899
|
-
0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
|
3900
|
-
0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
|
3901
|
-
0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
|
3902
|
-
0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
|
3903
|
-
0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
|
3904
|
-
0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
|
3905
|
-
0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
|
3906
|
-
0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
|
3907
|
-
0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
|
3908
|
-
0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
|
3909
|
-
0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
|
3910
|
-
0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
|
3911
|
-
0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
|
3912
|
-
0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
|
3913
|
-
0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
|
3914
|
-
0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
|
3915
|
-
0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
|
3916
|
-
0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
|
3917
|
-
0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
|
3918
|
-
0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
|
3919
|
-
0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
|
3920
|
-
0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
|
3921
|
-
0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
|
3922
|
-
0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
|
3923
|
-
0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
|
3924
|
-
0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
|
3925
|
-
0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
|
3926
|
-
0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
|
3927
|
-
0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
|
3928
|
-
0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
|
3929
|
-
0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
|
3930
|
-
0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
|
3931
|
-
0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
|
3932
|
-
0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
|
3933
|
-
0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
|
3934
|
-
0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
|
3935
|
-
0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
|
3936
|
-
0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
|
3937
|
-
0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
|
3938
|
-
0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
|
3939
|
-
0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
|
3940
|
-
0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
|
3941
|
-
0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
|
3942
|
-
0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
|
3943
|
-
0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
|
3944
|
-
0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
|
3945
|
-
0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
|
3946
|
-
0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
|
3947
|
-
0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
|
3948
|
-
0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
|
3949
|
-
0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
|
3950
|
-
0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
|
3951
|
-
0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
|
3952
|
-
0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
|
3953
|
-
0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
|
3954
|
-
0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
|
3955
|
-
0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
|
3956
|
-
0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
|
3957
|
-
0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
|
3958
|
-
0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
|
3959
|
-
0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
|
3960
|
-
0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
|
3961
|
-
0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
|
3962
|
-
0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
|
3963
|
-
0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
|
3964
|
-
0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
|
3965
|
-
0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
|
3966
|
-
0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
|
3967
|
-
0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
|
3968
|
-
0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
|
3969
|
-
0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
|
3970
|
-
0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
|
3971
|
-
0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
|
3972
|
-
0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
|
3973
|
-
0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
|
3974
|
-
0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
|
3975
|
-
0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
|
3976
|
-
0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
|
3977
|
-
0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
|
3978
|
-
0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
|
3979
|
-
0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
|
3980
|
-
0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
|
3981
|
-
0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
|
3982
|
-
0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
|
3983
|
-
0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
|
3984
|
-
0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
|
3985
|
-
0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
|
3986
|
-
0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
|
3987
|
-
0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
|
3988
|
-
0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
|
3989
|
-
0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
|
3990
|
-
0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
|
3991
|
-
0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
|
3992
|
-
0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
|
3993
|
-
0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
|
3994
|
-
0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
|
3995
|
-
0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
|
3996
|
-
0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
|
3997
|
-
0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
|
3998
|
-
0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
|
3999
|
-
0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
|
4000
|
-
0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
|
4001
|
-
0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
|
4002
|
-
0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
|
4003
|
-
0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
|
4004
|
-
0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
|
4005
|
-
0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
|
4006
|
-
0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
|
4007
|
-
0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
|
4008
|
-
0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
|
4009
|
-
0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
|
4010
|
-
0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
|
4011
|
-
0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
|
4012
|
-
0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
|
4013
|
-
0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
|
4014
|
-
0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
|
4015
|
-
0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
|
4016
|
-
0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
|
4017
|
-
0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
|
4018
|
-
|
4019
|
-
};
|
4020
|
-
|
4021
|
-
static const uint8_t ksigns_iq2xs[128] = {
|
4022
|
-
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
4023
|
-
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
4024
|
-
160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
|
4025
|
-
48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
|
4026
|
-
192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
|
4027
|
-
80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
|
4028
|
-
96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
|
4029
|
-
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
4030
|
-
};
|
3281
|
+
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3282
|
+
(void)quant_weights; // not used
|
3283
|
+
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
3284
|
+
quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
|
3285
|
+
return nrow * row_size;
|
3286
|
+
}
|
4031
3287
|
|
4032
|
-
|
3288
|
+
// ====================== "True" 2-bit (de)-quantization
|
4033
3289
|
|
4034
3290
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
4035
3291
|
assert(k % QK_K == 0);
|
@@ -4162,11 +3418,11 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
|
4162
3418
|
const uint8_t * signs = x[i].signs;
|
4163
3419
|
|
4164
3420
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
4165
|
-
const float db1 = d * (
|
4166
|
-
const float db2 = d * (
|
3421
|
+
const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
3422
|
+
const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
4167
3423
|
for (int l = 0; l < 4; ++l) {
|
4168
|
-
const uint8_t * grid1 = (const uint8_t *)(
|
4169
|
-
const uint8_t * grid2 = (const uint8_t *)(
|
3424
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
3425
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
4170
3426
|
for (int j = 0; j < 4; ++j) {
|
4171
3427
|
y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4172
3428
|
y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
@@ -4176,8 +3432,8 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
|
4176
3432
|
qs += 8;
|
4177
3433
|
signs += 4;
|
4178
3434
|
for (int l = 0; l < 4; ++l) {
|
4179
|
-
const uint8_t * grid1 = (const uint8_t *)(
|
4180
|
-
const uint8_t * grid2 = (const uint8_t *)(
|
3435
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
|
3436
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
|
4181
3437
|
for (int j = 0; j < 4; ++j) {
|
4182
3438
|
y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4183
3439
|
y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
@@ -4197,39 +3453,23 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
|
|
4197
3453
|
assert(k % QK_K == 0);
|
4198
3454
|
const int nb = k / QK_K;
|
4199
3455
|
|
4200
|
-
float db[4];
|
4201
|
-
uint16_t idx[4];
|
4202
|
-
//const int8_t * grid[4];
|
4203
|
-
|
4204
3456
|
for (int i = 0; i < nb; i++) {
|
4205
3457
|
|
4206
3458
|
const float d = GGML_FP16_TO_FP32(x[i].d);
|
4207
|
-
const uint8_t
|
4208
|
-
const
|
3459
|
+
const uint8_t * qs = x[i].qs;
|
3460
|
+
const uint16_t * qh = x[i].qh;
|
4209
3461
|
|
4210
|
-
for (int
|
4211
|
-
|
4212
|
-
|
4213
|
-
idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
|
4214
|
-
idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
|
4215
|
-
//grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
|
4216
|
-
//grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
|
4217
|
-
//grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
|
4218
|
-
//grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
|
4219
|
-
db[0] = d * (2*(sc[0] & 7) + 1);
|
4220
|
-
db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
|
4221
|
-
db[2] = d * (2*(sc[1] & 7) + 1);
|
4222
|
-
db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
|
3462
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
3463
|
+
const float dl = d * (2*((qh[ib] >> 12) & 7) + 1);
|
3464
|
+
const float delta = qh[ib] & 0x8000 ? -IQ1S_DELTA : IQ1S_DELTA;
|
4223
3465
|
for (int l = 0; l < 4; ++l) {
|
4224
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid +
|
3466
|
+
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
4225
3467
|
for (int j = 0; j < 8; ++j) {
|
4226
|
-
|
4227
|
-
y[j] = db[l] * grid[j];
|
3468
|
+
y[j] = dl * (grid[j] + delta);
|
4228
3469
|
}
|
4229
3470
|
y += 8;
|
4230
3471
|
}
|
4231
3472
|
qs += 4;
|
4232
|
-
sc += 2;
|
4233
3473
|
}
|
4234
3474
|
}
|
4235
3475
|
}
|
@@ -4783,10 +4023,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4783
4023
|
const block_q8_1 * restrict b_y0 = &vy0[i];
|
4784
4024
|
const block_q8_1 * restrict b_y1 = &vy1[i];
|
4785
4025
|
|
4786
|
-
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
|
4787
|
-
GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
|
4788
|
-
GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
|
4789
|
-
GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
|
4026
|
+
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
|
4027
|
+
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
|
4028
|
+
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
|
4029
|
+
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
|
4790
4030
|
summs0 += summs_t;
|
4791
4031
|
|
4792
4032
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
@@ -4807,10 +4047,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4807
4047
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
4808
4048
|
|
4809
4049
|
// mmla into int32x4_t
|
4810
|
-
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*
|
4811
|
-
GGML_FP16_TO_FP32(b_x0->d)*
|
4812
|
-
GGML_FP16_TO_FP32(b_x1->d)*
|
4813
|
-
GGML_FP16_TO_FP32(b_x1->d)*
|
4050
|
+
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
|
4051
|
+
GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
|
4052
|
+
GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
|
4053
|
+
GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
|
4814
4054
|
|
4815
4055
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
4816
4056
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
@@ -4851,7 +4091,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4851
4091
|
const block_q8_1 * restrict y0 = &y[i + 0];
|
4852
4092
|
const block_q8_1 * restrict y1 = &y[i + 1];
|
4853
4093
|
|
4854
|
-
summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
|
4094
|
+
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
|
4855
4095
|
|
4856
4096
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
4857
4097
|
|
@@ -4874,8 +4114,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4874
4114
|
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
4875
4115
|
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
4876
4116
|
|
4877
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
4878
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
4117
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
4118
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
4879
4119
|
}
|
4880
4120
|
|
4881
4121
|
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
|
@@ -4888,9 +4128,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4888
4128
|
// Main loop
|
4889
4129
|
for (int i = 0; i < nb; ++i) {
|
4890
4130
|
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
4891
|
-
const float d1 = y[i].d;
|
4131
|
+
const float d1 = GGML_FP16_TO_FP32(y[i].d);
|
4892
4132
|
|
4893
|
-
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
4133
|
+
summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
|
4894
4134
|
|
4895
4135
|
const __m256 d0v = _mm256_set1_ps( d0 );
|
4896
4136
|
const __m256 d1v = _mm256_set1_ps( d1 );
|
@@ -4942,7 +4182,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4942
4182
|
|
4943
4183
|
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
4944
4184
|
|
4945
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
4185
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
4946
4186
|
}
|
4947
4187
|
|
4948
4188
|
*s = sumf;
|
@@ -4960,7 +4200,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
4960
4200
|
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
4961
4201
|
}
|
4962
4202
|
|
4963
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
4203
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
4964
4204
|
}
|
4965
4205
|
|
4966
4206
|
*s = sumf;
|
@@ -5296,8 +4536,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5296
4536
|
|
5297
4537
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
5298
4538
|
|
5299
|
-
summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
|
5300
|
-
summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
|
4539
|
+
summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
|
4540
|
+
summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
|
5301
4541
|
|
5302
4542
|
// extract the 5th bit via lookup table ((b) << 4)
|
5303
4543
|
memcpy(&qh0, x0->qh, sizeof(qh0));
|
@@ -5341,10 +4581,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5341
4581
|
|
5342
4582
|
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
5343
4583
|
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
5344
|
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
4584
|
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
5345
4585
|
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
5346
4586
|
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
5347
|
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
4587
|
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
5348
4588
|
}
|
5349
4589
|
|
5350
4590
|
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
@@ -5361,7 +4601,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5361
4601
|
const block_q5_1 * restrict x0 = &x[i];
|
5362
4602
|
const block_q8_1 * restrict y0 = &y[i];
|
5363
4603
|
|
5364
|
-
summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
|
4604
|
+
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
|
5365
4605
|
|
5366
4606
|
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
5367
4607
|
|
@@ -5408,7 +4648,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5408
4648
|
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
5409
4649
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
5410
4650
|
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
5411
|
-
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
|
4651
|
+
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
|
5412
4652
|
}
|
5413
4653
|
|
5414
4654
|
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
@@ -5423,14 +4663,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5423
4663
|
for (int i = 0; i < nb; i++) {
|
5424
4664
|
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
5425
4665
|
|
5426
|
-
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
4666
|
+
summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
|
5427
4667
|
|
5428
4668
|
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
5429
4669
|
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
5430
4670
|
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
5431
4671
|
qx = _mm256_or_si256(qx, bxhi);
|
5432
4672
|
|
5433
|
-
const __m256 dy = _mm256_set1_ps(y[i].d);
|
4673
|
+
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
|
5434
4674
|
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
5435
4675
|
|
5436
4676
|
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
@@ -5450,7 +4690,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5450
4690
|
for (int i = 0; i < nb; i++) {
|
5451
4691
|
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
5452
4692
|
|
5453
|
-
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
4693
|
+
summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
|
5454
4694
|
|
5455
4695
|
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
|
5456
4696
|
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
@@ -5464,7 +4704,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5464
4704
|
bxh = _mm_or_si128(bxh, bxhih);
|
5465
4705
|
bx_0 = MM256_SET_M128I(bxh, bxl);
|
5466
4706
|
|
5467
|
-
const __m256 dy = _mm256_set1_ps(y[i].d);
|
4707
|
+
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
|
5468
4708
|
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
5469
4709
|
|
5470
4710
|
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
|
@@ -5531,7 +4771,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5531
4771
|
|
5532
4772
|
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
5533
4773
|
|
5534
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
4774
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
5535
4775
|
}
|
5536
4776
|
|
5537
4777
|
*s = sumf;
|
@@ -5555,7 +4795,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
5555
4795
|
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
5556
4796
|
}
|
5557
4797
|
|
5558
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
4798
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
5559
4799
|
}
|
5560
4800
|
|
5561
4801
|
*s = sumf;
|
@@ -9563,7 +8803,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9563
8803
|
|
9564
8804
|
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
9565
8805
|
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
|
9566
|
-
const __m256i full_signs =
|
8806
|
+
const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
|
9567
8807
|
|
9568
8808
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
9569
8809
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
|
@@ -9585,8 +8825,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9585
8825
|
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
9586
8826
|
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
9587
8827
|
|
9588
|
-
const __m256i sc1 =
|
9589
|
-
const __m256i sc2 =
|
8828
|
+
const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
|
8829
|
+
const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
|
9590
8830
|
|
9591
8831
|
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
|
9592
8832
|
|
@@ -9653,8 +8893,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9653
8893
|
|
9654
8894
|
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
|
9655
8895
|
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
|
9656
|
-
const __m256i full_signs_1 =
|
9657
|
-
const __m256i full_signs_2 =
|
8896
|
+
const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
|
8897
|
+
const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
|
9658
8898
|
|
9659
8899
|
__m256i signs;
|
9660
8900
|
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
|
@@ -9757,8 +8997,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9757
8997
|
|
9758
8998
|
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
9759
8999
|
|
9760
|
-
const
|
9761
|
-
const uint8x16_t
|
9000
|
+
const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
|
9001
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
9762
9002
|
const uint8x16_t m1 = vdupq_n_u8(1);
|
9763
9003
|
const int32x4_t vzero = vdupq_n_s32(0);
|
9764
9004
|
|
@@ -9789,7 +9029,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9789
9029
|
vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
|
9790
9030
|
qs += 8;
|
9791
9031
|
|
9792
|
-
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
9032
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
|
9793
9033
|
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9794
9034
|
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9795
9035
|
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
@@ -9798,7 +9038,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9798
9038
|
q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
|
9799
9039
|
q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
|
9800
9040
|
|
9801
|
-
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
9041
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
|
9802
9042
|
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
9803
9043
|
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
9804
9044
|
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
@@ -9869,12 +9109,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9869
9109
|
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
9870
9110
|
qs += 8;
|
9871
9111
|
|
9872
|
-
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
9112
|
+
__m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
|
9873
9113
|
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9874
9114
|
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
9875
9115
|
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
9876
9116
|
|
9877
|
-
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
9117
|
+
aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
|
9878
9118
|
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
9879
9119
|
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
9880
9120
|
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
@@ -10074,7 +9314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
10074
9314
|
#endif
|
10075
9315
|
}
|
10076
9316
|
|
10077
|
-
void ggml_vec_dot_iq3_s_q8_K (int n, float *
|
9317
|
+
void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
10078
9318
|
assert(n % QK_K == 0);
|
10079
9319
|
assert(nrc == 1);
|
10080
9320
|
UNUSED(nrc);
|
@@ -10089,18 +9329,35 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10089
9329
|
|
10090
9330
|
#if defined(__ARM_NEON)
|
10091
9331
|
|
9332
|
+
typedef union {
|
9333
|
+
uint16x8_t vec_index;
|
9334
|
+
uint16_t index[8];
|
9335
|
+
} vec_index_t;
|
9336
|
+
|
10092
9337
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
10093
9338
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
10094
9339
|
};
|
10095
9340
|
|
10096
9341
|
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
10097
9342
|
|
10098
|
-
const
|
10099
|
-
|
9343
|
+
static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
|
9344
|
+
|
9345
|
+
const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
|
9346
|
+
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
9347
|
+
|
9348
|
+
const int16x8_t hshift = vld1q_s16(k_shift);
|
9349
|
+
const uint16x8_t m256 = vdupq_n_u16(256);
|
9350
|
+
const uint8x16_t m1 = vdupq_n_u8(1);
|
10100
9351
|
|
10101
9352
|
uint8x16x2_t vs;
|
10102
9353
|
ggml_int8x16x4_t q3s;
|
10103
9354
|
ggml_int8x16x4_t q8b;
|
9355
|
+
vec_index_t idx;
|
9356
|
+
|
9357
|
+
#if QK_K == 256
|
9358
|
+
uint32_t scales32[2];
|
9359
|
+
const uint8_t * scales8 = (const uint8_t *)scales32;
|
9360
|
+
#endif
|
10104
9361
|
|
10105
9362
|
float sumf = 0;
|
10106
9363
|
for (int i = 0; i < nb; ++i) {
|
@@ -10109,47 +9366,63 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10109
9366
|
const uint8_t * restrict qh = x[i].qh;
|
10110
9367
|
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
10111
9368
|
const int8_t * restrict q8 = y[i].qs;
|
9369
|
+
|
9370
|
+
#if QK_K == 256
|
9371
|
+
memcpy(scales32, x[i].scales, 4);
|
9372
|
+
scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
|
9373
|
+
scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
|
9374
|
+
#endif
|
9375
|
+
|
10112
9376
|
int sumi1 = 0, sumi2 = 0;
|
10113
9377
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10114
9378
|
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
10115
|
-
const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
|
10116
|
-
iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
|
10117
|
-
const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
|
10118
|
-
iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
|
10119
|
-
const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
|
10120
|
-
iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
|
10121
|
-
const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
|
10122
|
-
iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
|
10123
|
-
qs += 16;
|
10124
9379
|
|
10125
|
-
|
9380
|
+
const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
|
9381
|
+
idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
|
9382
|
+
const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
|
9383
|
+
iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
|
9384
|
+
const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
|
9385
|
+
iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
|
9386
|
+
idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
|
9387
|
+
const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
|
9388
|
+
iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
|
9389
|
+
const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
|
9390
|
+
iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
|
9391
|
+
|
9392
|
+
|
9393
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
|
10126
9394
|
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
10127
9395
|
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
10128
|
-
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
10129
|
-
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9396
|
+
vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
|
9397
|
+
vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
|
10130
9398
|
|
10131
|
-
q3s.val[0] =
|
10132
|
-
q3s.val[1] =
|
9399
|
+
q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
|
9400
|
+
q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
|
10133
9401
|
|
10134
|
-
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
9402
|
+
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
|
10135
9403
|
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
10136
9404
|
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
10137
|
-
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
10138
|
-
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
9405
|
+
vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
|
9406
|
+
vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
|
10139
9407
|
|
10140
9408
|
signs += 4;
|
10141
9409
|
|
10142
|
-
q3s.val[2] =
|
10143
|
-
q3s.val[3] =
|
9410
|
+
q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
|
9411
|
+
q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
|
10144
9412
|
|
10145
9413
|
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
10146
9414
|
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
9415
|
+
#if QK_K == 256
|
9416
|
+
sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
|
9417
|
+
sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
|
9418
|
+
#else
|
10147
9419
|
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
|
10148
9420
|
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
|
9421
|
+
#endif
|
10149
9422
|
}
|
10150
9423
|
sumf += d*(sumi1 + sumi2);
|
10151
9424
|
}
|
10152
|
-
*s =
|
9425
|
+
*s = sumf;
|
10153
9426
|
|
10154
9427
|
#elif defined(__AVX2__)
|
10155
9428
|
|
@@ -10164,6 +9437,16 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10164
9437
|
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
10165
9438
|
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
10166
9439
|
|
9440
|
+
const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
|
9441
|
+
const __m256i idx_mask = _mm256_set1_epi32(256);
|
9442
|
+
|
9443
|
+
typedef union {
|
9444
|
+
__m256i vec[2];
|
9445
|
+
uint32_t index[16];
|
9446
|
+
} index_t;
|
9447
|
+
|
9448
|
+
index_t idx;
|
9449
|
+
|
10167
9450
|
__m256 accumf = _mm256_setzero_ps();
|
10168
9451
|
for (int i = 0; i < nb; ++i) {
|
10169
9452
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
@@ -10176,24 +9459,25 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10176
9459
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10177
9460
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10178
9461
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10179
|
-
const __m256i
|
10180
|
-
|
10181
|
-
|
10182
|
-
|
10183
|
-
|
10184
|
-
|
10185
|
-
|
10186
|
-
|
10187
|
-
|
10188
|
-
const __m256i
|
10189
|
-
|
10190
|
-
|
10191
|
-
|
10192
|
-
|
10193
|
-
|
10194
|
-
|
10195
|
-
|
10196
|
-
|
9462
|
+
const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
|
9463
|
+
idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
|
9464
|
+
idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
|
9465
|
+
idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
|
9466
|
+
idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
|
9467
|
+
idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
|
9468
|
+
idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
|
9469
|
+
|
9470
|
+
// At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
|
9471
|
+
//const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
|
9472
|
+
//const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
|
9473
|
+
const __m256i q2_1 = _mm256_set_epi32(
|
9474
|
+
iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
|
9475
|
+
iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
|
9476
|
+
);
|
9477
|
+
const __m256i q2_2 = _mm256_set_epi32(
|
9478
|
+
iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
|
9479
|
+
iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
|
9480
|
+
);
|
10197
9481
|
|
10198
9482
|
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
10199
9483
|
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
@@ -10221,7 +9505,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10221
9505
|
|
10222
9506
|
}
|
10223
9507
|
|
10224
|
-
*s =
|
9508
|
+
*s = hsum_float_8(accumf);
|
10225
9509
|
|
10226
9510
|
#else
|
10227
9511
|
|
@@ -10238,8 +9522,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10238
9522
|
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
10239
9523
|
int32_t sumi = 0;
|
10240
9524
|
for (int l = 0; l < 4; ++l) {
|
10241
|
-
const uint8_t * grid1 = (const uint8_t *)(
|
10242
|
-
const uint8_t * grid2 = (const uint8_t *)(
|
9525
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
9526
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
10243
9527
|
for (int j = 0; j < 4; ++j) {
|
10244
9528
|
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
10245
9529
|
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
@@ -10251,8 +9535,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10251
9535
|
bsum += sumi * ls1;
|
10252
9536
|
sumi = 0;
|
10253
9537
|
for (int l = 0; l < 4; ++l) {
|
10254
|
-
const uint8_t * grid1 = (const uint8_t *)(
|
10255
|
-
const uint8_t * grid2 = (const uint8_t *)(
|
9538
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
9539
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
10256
9540
|
for (int j = 0; j < 4; ++j) {
|
10257
9541
|
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
10258
9542
|
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
@@ -10265,7 +9549,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
10265
9549
|
}
|
10266
9550
|
sumf += d * bsum;
|
10267
9551
|
}
|
10268
|
-
*s =
|
9552
|
+
*s = sumf;
|
10269
9553
|
#endif
|
10270
9554
|
}
|
10271
9555
|
|
@@ -10278,7 +9562,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
|
10278
9562
|
}
|
10279
9563
|
#endif
|
10280
9564
|
|
10281
|
-
void ggml_vec_dot_iq1_s_q8_K (int n, float *
|
9565
|
+
void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
10282
9566
|
assert(n % QK_K == 0);
|
10283
9567
|
assert(nrc == 1);
|
10284
9568
|
UNUSED(nrc);
|
@@ -10291,155 +9575,119 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
10291
9575
|
|
10292
9576
|
const int nb = n / QK_K;
|
10293
9577
|
|
10294
|
-
|
10295
|
-
#if defined __ARM_NEON && QK_K == 256
|
10296
|
-
|
10297
|
-
const uint8x16_t m8 = vdupq_n_u8(0x08);
|
10298
|
-
const uint8x16_t m7 = vdupq_n_u8(0x07);
|
10299
|
-
const uint8x16_t m1 = vdupq_n_u8(0x01);
|
10300
|
-
const int32x4_t vzero = vdupq_n_s32(0);
|
9578
|
+
#if defined __ARM_NEON
|
10301
9579
|
|
10302
|
-
|
10303
|
-
uint16x8x2_t vindex;
|
10304
|
-
int8x16x4_t q1b;
|
9580
|
+
ggml_int8x16x4_t q1b;
|
10305
9581
|
ggml_int8x16x4_t q8b;
|
10306
|
-
uint16x8x4_t scales;
|
10307
|
-
int32x4x2_t sumi;
|
10308
|
-
int32x4x2_t dotq;
|
10309
9582
|
|
10310
9583
|
float sumf = 0;
|
10311
9584
|
for (int i = 0; i < nb; ++i) {
|
10312
9585
|
|
10313
|
-
const int8_t
|
10314
|
-
const uint8_t
|
10315
|
-
const
|
9586
|
+
const int8_t * q8 = y[i].qs;
|
9587
|
+
const uint8_t * qs = x[i].qs;
|
9588
|
+
const uint16_t * qh = x[i].qh;
|
10316
9589
|
|
10317
|
-
|
9590
|
+
int sumi1 = 0, sumi2 = 0, sumi3 = 0;
|
10318
9591
|
|
10319
|
-
for (int
|
10320
|
-
const uint8x16_t ql = vld1q_u8(qs); qs += 16;
|
10321
|
-
const uint8x8_t tm1 = vld1_u8 (sc); sc += 8;
|
10322
|
-
const uint8x8_t tm2 = vshr_n_u8(tm1, 4);
|
10323
|
-
const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
|
10324
|
-
const uint8x16_t hbit = vandq_u8(qh, m8);
|
10325
|
-
vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
|
10326
|
-
vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
|
10327
|
-
const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
|
10328
|
-
scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
|
10329
|
-
scales.val[1] = vmovl_u8(vget_high_u8 (scales8));
|
9592
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
10330
9593
|
|
10331
|
-
|
10332
|
-
|
10333
|
-
|
10334
|
-
|
10335
|
-
|
10336
|
-
|
10337
|
-
|
9594
|
+
q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))),
|
9595
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700)))));
|
9596
|
+
q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))),
|
9597
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700)))));
|
9598
|
+
q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))),
|
9599
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700)))));
|
9600
|
+
q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))),
|
9601
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700)))));
|
9602
|
+
qs += 8;
|
10338
9603
|
|
10339
|
-
|
10340
|
-
|
9604
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9605
|
+
|
9606
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]);
|
9607
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]);
|
9608
|
+
|
9609
|
+
const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
|
9610
|
+
const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
|
9611
|
+
sumi1 += vaddvq_s32(p1) * ls1;
|
9612
|
+
sumi2 += vaddvq_s32(p2) * ls2;
|
9613
|
+
sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1)
|
9614
|
+
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1);
|
10341
9615
|
|
10342
|
-
sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
|
10343
|
-
sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
|
10344
|
-
}
|
10345
9616
|
}
|
10346
9617
|
|
10347
|
-
sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) *
|
9618
|
+
sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
|
10348
9619
|
}
|
10349
9620
|
|
10350
9621
|
*s = sumf;
|
10351
9622
|
|
10352
|
-
|
10353
|
-
#elif defined __AVX2__ && QK_K == 256
|
10354
|
-
|
10355
|
-
const __m128i m8 = _mm_set1_epi8(0x08);
|
10356
|
-
const __m128i m7 = _mm_set1_epi8(0x07);
|
10357
|
-
const __m128i m1 = _mm_set1_epi8(0x01);
|
10358
|
-
const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
|
10359
|
-
const __m128i shuffle_s[4] = {
|
10360
|
-
_mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
|
10361
|
-
_mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
|
10362
|
-
_mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
|
10363
|
-
_mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
|
10364
|
-
};
|
10365
|
-
|
10366
|
-
uint64_t aux64;
|
10367
|
-
|
10368
|
-
typedef union m256i_uint16 {
|
10369
|
-
__m256i reg;
|
10370
|
-
uint16_t s[16];
|
10371
|
-
} m256i_uint16_t;
|
10372
|
-
|
10373
|
-
m256i_uint16_t v_gindex;
|
9623
|
+
#elif defined __AVX2__
|
10374
9624
|
|
10375
9625
|
__m256 accum = _mm256_setzero_ps();
|
9626
|
+
float accum1 = 0;
|
10376
9627
|
for (int i = 0; i < nb; ++i) {
|
10377
9628
|
|
10378
|
-
const int8_t
|
10379
|
-
const uint8_t
|
10380
|
-
const
|
9629
|
+
const int8_t * q8 = y[i].qs;
|
9630
|
+
const uint8_t * qs = x[i].qs;
|
9631
|
+
const uint16_t * qh = x[i].qh;
|
10381
9632
|
|
10382
9633
|
__m256i sumi = _mm256_setzero_si256();
|
10383
|
-
|
10384
|
-
|
10385
|
-
|
10386
|
-
|
10387
|
-
const __m256i
|
10388
|
-
|
10389
|
-
|
9634
|
+
int sumi1 = 0;
|
9635
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
9636
|
+
const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
|
9637
|
+
iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
|
9638
|
+
const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
|
9639
|
+
iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
|
9640
|
+
qs += 8;
|
9641
|
+
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
9642
|
+
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
10390
9643
|
|
10391
|
-
|
10392
|
-
|
10393
|
-
|
10394
|
-
|
10395
|
-
|
10396
|
-
|
10397
|
-
const __m256i p = _mm256_madd_epi16(s16, dot);
|
10398
|
-
sumi = _mm256_add_epi32(sumi, p);
|
10399
|
-
}
|
9644
|
+
const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
|
9645
|
+
const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
|
9646
|
+
const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
|
9647
|
+
const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
|
9648
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
|
9649
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
|
10400
9650
|
|
9651
|
+
sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
|
9652
|
+
sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
|
9653
|
+
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
|
10401
9654
|
}
|
10402
9655
|
|
10403
|
-
|
9656
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
9657
|
+
accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
|
9658
|
+
accum1 += d * sumi1;
|
10404
9659
|
|
10405
9660
|
}
|
10406
9661
|
|
10407
|
-
*s = hsum_float_8(accum);
|
9662
|
+
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
10408
9663
|
|
10409
9664
|
#else
|
10410
9665
|
|
10411
|
-
int db[4];
|
10412
|
-
uint16_t idx[4];
|
10413
|
-
|
10414
9666
|
float sumf = 0;
|
10415
|
-
for (int i = 0; i < nb; ++
|
9667
|
+
for (int i = 0; i < nb; i++) {
|
10416
9668
|
|
10417
|
-
const int8_t
|
10418
|
-
const uint8_t
|
10419
|
-
const
|
9669
|
+
const int8_t * q8 = y[i].qs;
|
9670
|
+
const uint8_t * qs = x[i].qs;
|
9671
|
+
const uint16_t * qh = x[i].qh;
|
10420
9672
|
|
10421
|
-
int sumi = 0;
|
10422
|
-
for (int
|
10423
|
-
|
10424
|
-
|
10425
|
-
|
10426
|
-
idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
|
10427
|
-
db[0] = (2*(sc[0] & 7) + 1);
|
10428
|
-
db[1] = (2*((sc[0] >> 4) & 7) + 1);
|
10429
|
-
db[2] = (2*(sc[1] & 7) + 1);
|
10430
|
-
db[3] = (2*((sc[1] >> 4) & 7) + 1);
|
9673
|
+
int sumi = 0, sumi1 = 0;
|
9674
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
9675
|
+
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
9676
|
+
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
9677
|
+
int lsum = 0;
|
10431
9678
|
for (int l = 0; l < 4; ++l) {
|
10432
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid +
|
10433
|
-
int
|
10434
|
-
|
10435
|
-
|
9679
|
+
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
9680
|
+
for (int j = 0; j < 8; ++j) {
|
9681
|
+
lsum += q8[j] * grid[j];
|
9682
|
+
}
|
10436
9683
|
q8 += 8;
|
10437
9684
|
}
|
9685
|
+
sumi += ls * lsum;
|
9686
|
+
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
10438
9687
|
qs += 4;
|
10439
|
-
sc += 2;
|
10440
9688
|
}
|
10441
9689
|
|
10442
|
-
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
|
9690
|
+
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
10443
9691
|
}
|
10444
9692
|
|
10445
9693
|
*s = sumf;
|
@@ -10508,10 +9756,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
10508
9756
|
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
|
10509
9757
|
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
|
10510
9758
|
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
|
10511
|
-
const __m256i q4b_1 =
|
10512
|
-
|
10513
|
-
const __m256i q4b_2 =
|
10514
|
-
|
9759
|
+
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
9760
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
9761
|
+
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
9762
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
10515
9763
|
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
10516
9764
|
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
10517
9765
|
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
|
@@ -10618,10 +9866,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
10618
9866
|
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
|
10619
9867
|
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10620
9868
|
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
10621
|
-
const __m256i q4b_1 =
|
10622
|
-
|
10623
|
-
const __m256i q4b_2 =
|
10624
|
-
|
9869
|
+
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
9870
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
9871
|
+
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
9872
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
10625
9873
|
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
10626
9874
|
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
10627
9875
|
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
@@ -10700,7 +9948,7 @@ static inline int iq2_grid_size(enum ggml_type type) {
|
|
10700
9948
|
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
10701
9949
|
return type == GGML_TYPE_IQ2_XXS ? 256 :
|
10702
9950
|
type == GGML_TYPE_IQ2_XS ? 512 :
|
10703
|
-
type == GGML_TYPE_IQ1_S ?
|
9951
|
+
type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024;
|
10704
9952
|
}
|
10705
9953
|
|
10706
9954
|
static int iq2_compare_func(const void * left, const void * right) {
|
@@ -10767,39 +10015,135 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
10767
10015
|
40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
|
10768
10016
|
42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
|
10769
10017
|
};
|
10770
|
-
static const uint16_t
|
10771
|
-
|
10772
|
-
|
10773
|
-
|
10774
|
-
|
10775
|
-
|
10776
|
-
|
10777
|
-
|
10778
|
-
|
10779
|
-
|
10780
|
-
|
10781
|
-
|
10782
|
-
|
10783
|
-
|
10784
|
-
|
10785
|
-
|
10786
|
-
|
10787
|
-
|
10788
|
-
|
10789
|
-
|
10790
|
-
|
10791
|
-
|
10792
|
-
|
10793
|
-
|
10794
|
-
|
10795
|
-
|
10796
|
-
|
10797
|
-
|
10798
|
-
|
10799
|
-
|
10800
|
-
|
10801
|
-
|
10802
|
-
|
10018
|
+
static const uint16_t kgrid_1bit_2048[NGRID_IQ1S] = {
|
10019
|
+
0, 2, 5, 8, 10, 17, 21, 32, 34, 40, 42, 69, 81, 84, 86, 101,
|
10020
|
+
128, 130, 136, 138, 149, 160, 162, 168, 170, 260, 261, 273, 276, 278, 281, 282,
|
10021
|
+
293, 321, 326, 329, 338, 341, 346, 353, 356, 358, 360, 389, 401, 404, 406, 421,
|
10022
|
+
512, 514, 520, 522, 533, 544, 546, 552, 554, 581, 593, 601, 612, 617, 640, 642,
|
10023
|
+
648, 650, 657, 661, 665, 672, 674, 680, 682, 1041, 1044, 1046, 1061, 1089, 1097, 1109,
|
10024
|
+
1114, 1124, 1125, 1169, 1177, 1189, 1281, 1284, 1285, 1286, 1301, 1304, 1306, 1321, 1344, 1349,
|
10025
|
+
1354, 1360, 1361, 1364, 1365, 1366, 1369, 1376, 1378, 1381, 1384, 1386, 1409, 1425, 1429, 1432,
|
10026
|
+
1434, 1441, 1444, 1445, 1446, 1449, 1556, 1561, 1601, 1604, 1616, 1618, 1621, 1624, 1632, 1633,
|
10027
|
+
1638, 1641, 1669, 1681, 1684, 1689, 2048, 2050, 2056, 2058, 2069, 2080, 2082, 2088, 2090, 2117,
|
10028
|
+
2129, 2134, 2149, 2176, 2178, 2184, 2186, 2197, 2208, 2210, 2216, 2218, 2309, 2321, 2324, 2329,
|
10029
|
+
2340, 2341, 2369, 2384, 2385, 2389, 2401, 2404, 2409, 2449, 2452, 2454, 2457, 2469, 2560, 2562,
|
10030
|
+
2568, 2570, 2581, 2592, 2594, 2600, 2602, 2629, 2641, 2649, 2657, 2661, 2688, 2690, 2693, 2696,
|
10031
|
+
2698, 2709, 2720, 2722, 2728, 2730, 4112, 4113, 4116, 4121, 4132, 4133, 4161, 4164, 4176, 4181,
|
10032
|
+
4184, 4193, 4196, 4197, 4201, 4241, 4244, 4246, 4257, 4261, 4353, 4356, 4358, 4361, 4368, 4370,
|
10033
|
+
4373, 4376, 4385, 4388, 4393, 4421, 4426, 4432, 4433, 4434, 4436, 4437, 4438, 4441, 4448, 4453,
|
10034
|
+
4484, 4498, 4501, 4513, 4516, 4625, 4628, 4630, 4645, 4672, 4678, 4681, 4690, 4693, 4696, 4698,
|
10035
|
+
4708, 4710, 4741, 4753, 4756, 4758, 4773, 5121, 5126, 5129, 5140, 5141, 5144, 5145, 5153, 5158,
|
10036
|
+
5185, 5189, 5190, 5192, 5194, 5201, 5204, 5205, 5206, 5209, 5218, 5221, 5224, 5252, 5257, 5264,
|
10037
|
+
5268, 5269, 5272, 5273, 5274, 5281, 5284, 5285, 5289, 5378, 5381, 5386, 5393, 5396, 5397, 5398,
|
10038
|
+
5401, 5408, 5410, 5413, 5416, 5418, 5441, 5444, 5445, 5446, 5457, 5458, 5460, 5461, 5462, 5465,
|
10039
|
+
5466, 5473, 5476, 5477, 5478, 5481, 5504, 5506, 5508, 5509, 5512, 5514, 5520, 5521, 5524, 5525,
|
10040
|
+
5526, 5529, 5530, 5536, 5538, 5541, 5633, 5636, 5637, 5638, 5653, 5654, 5656, 5658, 5665, 5670,
|
10041
|
+
5696, 5698, 5700, 5701, 5704, 5706, 5713, 5717, 5718, 5720, 5721, 5729, 5732, 5733, 5736, 5737,
|
10042
|
+
5738, 5766, 5770, 5778, 5781, 5796, 5801, 6161, 6166, 6181, 6209, 6212, 6214, 6217, 6224, 6229,
|
10043
|
+
6232, 6234, 6240, 6241, 6244, 6246, 6249, 6277, 6289, 6292, 6309, 6416, 6418, 6421, 6426, 6433,
|
10044
|
+
6437, 6466, 6468, 6469, 6472, 6481, 6484, 6485, 6486, 6489, 6490, 6496, 6501, 6506, 6537, 6545,
|
10045
|
+
6546, 6549, 6552, 6561, 6566, 6569, 6665, 6678, 6692, 6694, 6724, 6726, 6729, 6736, 6738, 6741,
|
10046
|
+
6744, 6753, 6758, 6761, 6789, 6801, 6806, 6810, 8192, 8194, 8200, 8202, 8213, 8224, 8226, 8229,
|
10047
|
+
8232, 8234, 8261, 8273, 8281, 8289, 8293, 8320, 8322, 8328, 8330, 8341, 8352, 8354, 8357, 8360,
|
10048
|
+
8362, 8453, 8465, 8468, 8473, 8485, 8514, 8516, 8521, 8533, 8536, 8538, 8545, 8548, 8549, 8550,
|
10049
|
+
8581, 8592, 8598, 8601, 8613, 8705, 8712, 8714, 8721, 8725, 8736, 8738, 8744, 8746, 8773, 8785,
|
10050
|
+
8790, 8793, 8805, 8833, 8840, 8842, 8849, 8853, 8864, 8866, 8872, 8874, 9221, 9236, 9238, 9241,
|
10051
|
+
9253, 9284, 9285, 9286, 9289, 9298, 9301, 9304, 9306, 9318, 9349, 9361, 9364, 9369, 9377, 9381,
|
10052
|
+
9481, 9493, 9505, 9513, 9536, 9541, 9544, 9553, 9556, 9557, 9561, 9570, 9573, 9576, 9609, 9616,
|
10053
|
+
9620, 9621, 9624, 9626, 9633, 9636, 9638, 9641, 9733, 9744, 9746, 9753, 9765, 9793, 9801, 9813,
|
10054
|
+
9824, 9825, 9833, 9860, 9862, 9872, 9882, 10240, 10242, 10248, 10250, 10261, 10272, 10274, 10280, 10282,
|
10055
|
+
10309, 10321, 10324, 10341, 10368, 10370, 10376, 10378, 10400, 10402, 10408, 10410, 10505, 10513, 10516, 10521,
|
10056
|
+
10533, 10566, 10569, 10578, 10581, 10593, 10596, 10598, 10601, 10629, 10640, 10646, 10649, 10660, 10661, 10752,
|
10057
|
+
10754, 10760, 10762, 10784, 10786, 10792, 10794, 10821, 10833, 10838, 10841, 10853, 10880, 10882, 10888, 10890,
|
10058
|
+
10901, 10912, 10914, 10920, 10922, 16389, 16401, 16406, 16421, 16457, 16466, 16469, 16472, 16474, 16481, 16484,
|
10059
|
+
16486, 16532, 16537, 16545, 16550, 16640, 16641, 16644, 16646, 16649, 16658, 16661, 16662, 16664, 16666, 16673,
|
10060
|
+
16678, 16681, 16709, 16712, 16714, 16721, 16724, 16725, 16726, 16729, 16730, 16741, 16744, 16746, 16769, 16772,
|
10061
|
+
16774, 16784, 16786, 16789, 16800, 16801, 16802, 16901, 16913, 16916, 16918, 16933, 16961, 16978, 16981, 16986,
|
10062
|
+
16996, 17001, 17033, 17044, 17061, 17409, 17429, 17433, 17449, 17477, 17480, 17482, 17489, 17492, 17493, 17494,
|
10063
|
+
17505, 17506, 17509, 17512, 17514, 17537, 17542, 17545, 17552, 17554, 17557, 17568, 17569, 17577, 17665, 17666,
|
10064
|
+
17669, 17674, 17681, 17684, 17685, 17686, 17689, 17696, 17701, 17706, 17729, 17732, 17733, 17734, 17737, 17744,
|
10065
|
+
17745, 17748, 17749, 17750, 17752, 17753, 17761, 17764, 17765, 17766, 17769, 17794, 17796, 17797, 17800, 17809,
|
10066
|
+
17812, 17813, 17814, 17817, 17818, 17829, 17832, 17834, 17921, 17925, 17929, 17940, 17941, 17944, 17946, 17953,
|
10067
|
+
17956, 17961, 17984, 17986, 17989, 17992, 18000, 18001, 18002, 18005, 18006, 18009, 18018, 18021, 18024, 18049,
|
10068
|
+
18053, 18058, 18068, 18069, 18081, 18084, 18086, 18437, 18449, 18453, 18458, 18469, 18498, 18505, 18512, 18517,
|
10069
|
+
18520, 18529, 18532, 18534, 18537, 18565, 18577, 18580, 18582, 18585, 18597, 18689, 18693, 18694, 18698, 18704,
|
10070
|
+
18708, 18709, 18712, 18721, 18724, 18726, 18752, 18757, 18762, 18769, 18770, 18772, 18773, 18774, 18777, 18784,
|
10071
|
+
18786, 18789, 18790, 18794, 18822, 18825, 18834, 18837, 18838, 18840, 18849, 18852, 18854, 18857, 18966, 19012,
|
10072
|
+
19014, 19017, 19029, 19032, 19034, 19044, 19049, 19092, 19109, 20481, 20484, 20485, 20486, 20489, 20498, 20501,
|
10073
|
+
20506, 20513, 20516, 20521, 20544, 20549, 20552, 20561, 20564, 20565, 20566, 20569, 20581, 20584, 20614, 20617,
|
10074
|
+
20629, 20632, 20640, 20641, 20646, 20649, 20741, 20744, 20745, 20746, 20753, 20756, 20757, 20758, 20760, 20761,
|
10075
|
+
20768, 20773, 20774, 20776, 20778, 20801, 20804, 20805, 20806, 20809, 20816, 20817, 20818, 20820, 20821, 20822,
|
10076
|
+
20824, 20825, 20826, 20833, 20836, 20837, 20838, 20841, 20866, 20869, 20881, 20884, 20885, 20886, 20889, 20896,
|
10077
|
+
20901, 20906, 20993, 20998, 21010, 21013, 21018, 21025, 21028, 21058, 21061, 21066, 21073, 21076, 21077, 21078,
|
10078
|
+
21081, 21090, 21093, 21125, 21136, 21138, 21141, 21145, 21146, 21156, 21508, 21509, 21521, 21524, 21525, 21526,
|
10079
|
+
21528, 21529, 21537, 21541, 21544, 21546, 21569, 21572, 21573, 21574, 21577, 21578, 21584, 21585, 21588, 21589,
|
10080
|
+
21590, 21592, 21593, 21594, 21601, 21602, 21604, 21605, 21606, 21609, 21632, 21640, 21642, 21649, 21652, 21653,
|
10081
|
+
21654, 21657, 21665, 21668, 21669, 21674, 21761, 21762, 21764, 21765, 21766, 21769, 21776, 21777, 21778, 21780,
|
10082
|
+
21781, 21782, 21785, 21786, 21793, 21796, 21797, 21798, 21801, 21824, 21825, 21826, 21828, 21829, 21830, 21832,
|
10083
|
+
21833, 21840, 21841, 21842, 21844, 21845, 21846, 21848, 21849, 21850, 21856, 21857, 21860, 21861, 21862, 21864,
|
10084
|
+
21865, 21866, 21889, 21892, 21893, 21897, 21898, 21904, 21905, 21908, 21909, 21910, 21912, 21913, 21921, 21924,
|
10085
|
+
21925, 21926, 21929, 22016, 22017, 22018, 22020, 22022, 22024, 22025, 22033, 22036, 22037, 22040, 22041, 22048,
|
10086
|
+
22049, 22050, 22052, 22053, 22054, 22056, 22057, 22081, 22085, 22086, 22088, 22089, 22090, 22096, 22097, 22098,
|
10087
|
+
22100, 22101, 22102, 22104, 22105, 22106, 22113, 22116, 22117, 22121, 22146, 22149, 22150, 22152, 22153, 22154,
|
10088
|
+
22161, 22165, 22170, 22178, 22181, 22182, 22184, 22185, 22532, 22533, 22534, 22537, 22544, 22549, 22552, 22561,
|
10089
|
+
22570, 22597, 22600, 22602, 22609, 22612, 22613, 22614, 22616, 22617, 22624, 22626, 22628, 22629, 22658, 22665,
|
10090
|
+
22672, 22674, 22677, 22680, 22689, 22697, 22785, 22786, 22789, 22794, 22801, 22804, 22805, 22806, 22809, 22821,
|
10091
|
+
22849, 22852, 22853, 22854, 22857, 22864, 22865, 22866, 22868, 22869, 22870, 22872, 22873, 22874, 22881, 22884,
|
10092
|
+
22885, 22886, 22889, 22913, 22917, 22921, 22929, 22932, 22933, 22934, 22936, 22937, 22949, 23044, 23048, 23061,
|
10093
|
+
23066, 23072, 23077, 23078, 23081, 23109, 23112, 23113, 23121, 23125, 23126, 23128, 23129, 23138, 23141, 23144,
|
10094
|
+
23146, 23169, 23178, 23186, 23189, 23190, 23192, 23194, 23201, 24581, 24596, 24598, 24601, 24613, 24644, 24656,
|
10095
|
+
24661, 24662, 24664, 24666, 24673, 24676, 24678, 24681, 24705, 24726, 24741, 24833, 24836, 24838, 24841, 24850,
|
10096
|
+
24853, 24865, 24866, 24870, 24873, 24901, 24905, 24913, 24917, 24918, 24921, 24933, 24934, 24938, 24964, 24970,
|
10097
|
+
24978, 24981, 24993, 24998, 25001, 25105, 25110, 25113, 25152, 25153, 25158, 25173, 25174, 25176, 25184, 25221,
|
10098
|
+
25233, 25238, 25253, 25617, 25618, 25621, 25622, 25626, 25633, 25638, 25641, 25664, 25666, 25669, 25672, 25674,
|
10099
|
+
25681, 25684, 25685, 25686, 25689, 25690, 25696, 25698, 25701, 25732, 25733, 25737, 25744, 25746, 25748, 25749,
|
10100
|
+
25750, 25752, 25754, 25761, 25764, 25769, 25861, 25864, 25866, 25873, 25877, 25878, 25881, 25924, 25925, 25926,
|
10101
|
+
25929, 25936, 25937, 25940, 25941, 25942, 25945, 25953, 25956, 25957, 25958, 25961, 25990, 25993, 25994, 26001,
|
10102
|
+
26005, 26006, 26009, 26010, 26018, 26021, 26022, 26024, 26114, 26121, 26133, 26144, 26150, 26152, 26153, 26176,
|
10103
|
+
26181, 26184, 26186, 26193, 26196, 26197, 26198, 26200, 26202, 26208, 26213, 26216, 26240, 26242, 26245, 26250,
|
10104
|
+
26260, 26262, 26264, 26265, 26272, 26276, 26278, 26282, 26646, 26649, 26661, 26689, 26706, 26709, 26714, 26721,
|
10105
|
+
26729, 26757, 26769, 26776, 26790, 26881, 26884, 26896, 26901, 26913, 26916, 26918, 26921, 26944, 26945, 26949,
|
10106
|
+
26950, 26952, 26961, 26964, 26965, 26966, 26969, 26976, 26981, 26986, 27010, 27012, 27018, 27029, 27041, 27044,
|
10107
|
+
27045, 27049, 27153, 27158, 27160, 27201, 27204, 27209, 27216, 27221, 27224, 27226, 27236, 27237, 27241, 27270,
|
10108
|
+
27284, 27288, 27290, 27302, 32768, 32770, 32776, 32778, 32800, 32802, 32808, 32810, 32837, 32848, 32849, 32852,
|
10109
|
+
32854, 32857, 32869, 32896, 32898, 32904, 32906, 32917, 32928, 32930, 32936, 32938, 33029, 33041, 33044, 33046,
|
10110
|
+
33049, 33061, 33089, 33092, 33097, 33104, 33106, 33109, 33110, 33112, 33113, 33124, 33126, 33129, 33157, 33161,
|
10111
|
+
33172, 33174, 33177, 33189, 33280, 33282, 33288, 33290, 33301, 33312, 33314, 33320, 33322, 33361, 33364, 33369,
|
10112
|
+
33381, 33408, 33410, 33416, 33418, 33429, 33440, 33442, 33448, 33450, 33812, 33817, 33857, 33860, 33873, 33877,
|
10113
|
+
33882, 33889, 33892, 33897, 33940, 33945, 34049, 34057, 34066, 34069, 34074, 34086, 34089, 34112, 34113, 34117,
|
10114
|
+
34120, 34129, 34132, 34133, 34134, 34137, 34138, 34149, 34150, 34152, 34154, 34177, 34180, 34182, 34185, 34192,
|
10115
|
+
34194, 34197, 34200, 34214, 34321, 34326, 34329, 34341, 34369, 34372, 34377, 34378, 34384, 34389, 34393, 34394,
|
10116
|
+
34401, 34406, 34410, 34437, 34449, 34458, 34468, 34816, 34818, 34824, 34826, 34837, 34848, 34850, 34856, 34858,
|
10117
|
+
34881, 34885, 34897, 34900, 34905, 34917, 34921, 34944, 34946, 34952, 34954, 34965, 34976, 34978, 34984, 34986,
|
10118
|
+
35077, 35078, 35089, 35092, 35094, 35109, 35137, 35140, 35142, 35145, 35152, 35154, 35157, 35162, 35169, 35172,
|
10119
|
+
35205, 35222, 35225, 35237, 35328, 35330, 35336, 35338, 35349, 35360, 35362, 35368, 35370, 35397, 35409, 35412,
|
10120
|
+
35414, 35456, 35458, 35464, 35466, 35477, 35488, 35490, 35496, 35498, 36869, 36881, 36886, 36888, 36889, 36901,
|
10121
|
+
36929, 36934, 36937, 36949, 36952, 36954, 36969, 36970, 36997, 37009, 37012, 37014, 37017, 37029, 37121, 37124,
|
10122
|
+
37126, 37129, 37136, 37141, 37144, 37146, 37153, 37156, 37158, 37161, 37184, 37189, 37200, 37201, 37204, 37205,
|
10123
|
+
37206, 37209, 37218, 37221, 37252, 37254, 37266, 37269, 37272, 37281, 37284, 37286, 37289, 37381, 37393, 37396,
|
10124
|
+
37401, 37413, 37444, 37446, 37449, 37456, 37458, 37461, 37464, 37478, 37481, 37509, 37524, 37526, 37545, 37889,
|
10125
|
+
37892, 37894, 37904, 37909, 37912, 37926, 37952, 37962, 37969, 37972, 37973, 37974, 37976, 37977, 37984, 37985,
|
10126
|
+
37986, 37989, 38020, 38022, 38034, 38036, 38037, 38040, 38049, 38057, 38144, 38149, 38152, 38154, 38160, 38161,
|
10127
|
+
38164, 38165, 38166, 38169, 38177, 38181, 38185, 38186, 38209, 38212, 38213, 38214, 38217, 38224, 38225, 38226,
|
10128
|
+
38228, 38229, 38230, 38232, 38233, 38234, 38241, 38244, 38245, 38246, 38249, 38273, 38277, 38280, 38289, 38290,
|
10129
|
+
38292, 38293, 38294, 38297, 38298, 38304, 38306, 38309, 38312, 38314, 38401, 38404, 38416, 38421, 38425, 38432,
|
10130
|
+
38438, 38441, 38469, 38472, 38473, 38481, 38482, 38485, 38486, 38489, 38501, 38504, 38530, 38532, 38537, 38538,
|
10131
|
+
38546, 38548, 38549, 38564, 38566, 38569, 38917, 38934, 38937, 38949, 38977, 38982, 38992, 38994, 38997, 38998,
|
10132
|
+
39002, 39012, 39013, 39045, 39057, 39062, 39065, 39077, 39172, 39174, 39177, 39184, 39186, 39189, 39192, 39194,
|
10133
|
+
39200, 39201, 39204, 39206, 39232, 39234, 39237, 39240, 39242, 39249, 39252, 39253, 39254, 39257, 39266, 39269,
|
10134
|
+
39270, 39274, 39297, 39300, 39312, 39314, 39317, 39322, 39329, 39334, 39429, 39445, 39461, 39492, 39494, 39497,
|
10135
|
+
39504, 39509, 39512, 39521, 39557, 39569, 39572, 39573, 39574, 40960, 40962, 40968, 40970, 40981, 40992, 40994,
|
10136
|
+
41000, 41002, 41029, 41041, 41044, 41046, 41049, 41088, 41090, 41096, 41098, 41109, 41120, 41122, 41128, 41130,
|
10137
|
+
41221, 41225, 41233, 41236, 41238, 41241, 41242, 41286, 41289, 41297, 41301, 41304, 41306, 41313, 41316, 41349,
|
10138
|
+
41360, 41362, 41366, 41369, 41474, 41480, 41482, 41488, 41497, 41506, 41512, 41514, 41541, 41553, 41558, 41561,
|
10139
|
+
41573, 41600, 41602, 41608, 41610, 41621, 41632, 41634, 41640, 41642, 42009, 42021, 42049, 42052, 42064, 42068,
|
10140
|
+
42069, 42072, 42074, 42081, 42085, 42086, 42088, 42089, 42117, 42246, 42249, 42256, 42258, 42261, 42264, 42278,
|
10141
|
+
42281, 42306, 42309, 42321, 42324, 42325, 42326, 42329, 42341, 42346, 42369, 42372, 42373, 42374, 42377, 42386,
|
10142
|
+
42389, 42392, 42501, 42513, 42518, 42522, 42529, 42533, 42564, 42566, 42570, 42578, 42581, 42582, 42584, 42592,
|
10143
|
+
42594, 42630, 42640, 42645, 42646, 42649, 42657, 42660, 42662, 43008, 43010, 43016, 43018, 43040, 43042, 43048,
|
10144
|
+
43050, 43089, 43092, 43094, 43097, 43136, 43138, 43144, 43146, 43157, 43168, 43170, 43176, 43178, 43269, 43284,
|
10145
|
+
43289, 43297, 43301, 43329, 43344, 43349, 43354, 43361, 43366, 43369, 43408, 43414, 43520, 43522, 43528, 43530,
|
10146
|
+
43552, 43554, 43560, 43562, 43601, 43604, 43606, 43648, 43650, 43656, 43658, 43669, 43680, 43682, 43688, 43690,
|
10803
10147
|
};
|
10804
10148
|
static const uint16_t kgrid_2bit_1024[1024] = {
|
10805
10149
|
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
@@ -10873,12 +10217,12 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
10873
10217
|
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
|
10874
10218
|
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
|
10875
10219
|
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
10876
|
-
type == GGML_TYPE_IQ1_S ?
|
10220
|
+
type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024;
|
10877
10221
|
uint64_t * kgrid_q2xs;
|
10878
10222
|
int * kmap_q2xs;
|
10879
10223
|
uint16_t * kneighbors_q2xs;
|
10880
10224
|
|
10881
|
-
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
10225
|
+
//printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
10882
10226
|
uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
|
10883
10227
|
for (int k = 0; k < grid_size; ++k) {
|
10884
10228
|
int8_t * pos = (int8_t *)(the_grid + k);
|
@@ -10933,7 +10277,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
10933
10277
|
}
|
10934
10278
|
num_neighbors += n;
|
10935
10279
|
}
|
10936
|
-
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
10280
|
+
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
10937
10281
|
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
10938
10282
|
iq2_data[gindex].neighbours = kneighbors_q2xs;
|
10939
10283
|
int counter = 0;
|
@@ -11356,8 +10700,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
11356
10700
|
}
|
11357
10701
|
}
|
11358
10702
|
|
11359
|
-
size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row,
|
11360
|
-
(void)hist;
|
10703
|
+
size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11361
10704
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11362
10705
|
int nblock = n_per_row/QK_K;
|
11363
10706
|
char * qrow = (char *)dst;
|
@@ -11369,8 +10712,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row,
|
|
11369
10712
|
return nrow * nblock * sizeof(block_iq2_xxs);
|
11370
10713
|
}
|
11371
10714
|
|
11372
|
-
size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row,
|
11373
|
-
(void)hist;
|
10715
|
+
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11374
10716
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11375
10717
|
int nblock = n_per_row/QK_K;
|
11376
10718
|
char * qrow = (char *)dst;
|
@@ -11474,7 +10816,7 @@ void iq3xs_init_impl(int grid_size) {
|
|
11474
10816
|
int * kmap_q3xs;
|
11475
10817
|
uint16_t * kneighbors_q3xs;
|
11476
10818
|
|
11477
|
-
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
10819
|
+
//printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
11478
10820
|
uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
|
11479
10821
|
for (int k = 0; k < grid_size; ++k) {
|
11480
10822
|
int8_t * pos = (int8_t *)(the_grid + k);
|
@@ -11529,7 +10871,7 @@ void iq3xs_init_impl(int grid_size) {
|
|
11529
10871
|
}
|
11530
10872
|
num_neighbors += n;
|
11531
10873
|
}
|
11532
|
-
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
10874
|
+
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
11533
10875
|
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
11534
10876
|
iq3_data[gindex].neighbours = kneighbors_q3xs;
|
11535
10877
|
int counter = 0;
|
@@ -11812,8 +11154,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
11812
11154
|
}
|
11813
11155
|
}
|
11814
11156
|
|
11815
|
-
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row,
|
11816
|
-
(void)hist;
|
11157
|
+
size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11817
11158
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11818
11159
|
int nblock = n_per_row/QK_K;
|
11819
11160
|
char * qrow = (char *)dst;
|
@@ -11912,7 +11253,8 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
11912
11253
|
}
|
11913
11254
|
float best = 0;
|
11914
11255
|
float scale = max/(2*kMaxQ-1);
|
11915
|
-
for (int
|
11256
|
+
for (int k = 0; k < bs4; ++k) is_on_grid[k] = false;
|
11257
|
+
for (int is = -9; is <= 9; ++is) {
|
11916
11258
|
float id = (2*kMaxQ-1+is*0.2f)/max;
|
11917
11259
|
float this_scale = 1/id;
|
11918
11260
|
for (int k = 0; k < bs4; ++k) {
|
@@ -11948,7 +11290,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
11948
11290
|
if (n_not_ongrid > 0 && scale > 0) {
|
11949
11291
|
float id = 1/scale;
|
11950
11292
|
for (int k = 0; k < bs4; ++k) {
|
11951
|
-
if (is_on_grid[k]) continue;
|
11293
|
+
//if (is_on_grid[k]) continue;
|
11952
11294
|
uint16_t u = 0;
|
11953
11295
|
for (int i = 0; i < 4; ++i) {
|
11954
11296
|
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
@@ -12004,7 +11346,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
12004
11346
|
}
|
12005
11347
|
|
12006
11348
|
float d = max_scale/31;
|
12007
|
-
y[ibl].d = GGML_FP32_TO_FP16(d);
|
11349
|
+
y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f);
|
12008
11350
|
float id = 1/d;
|
12009
11351
|
for (int ib = 0; ib < QK_K/block_size; ib += 2) {
|
12010
11352
|
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
|
@@ -12018,8 +11360,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
12018
11360
|
}
|
12019
11361
|
|
12020
11362
|
#define IQ3S_BLOCK_SIZE 32
|
12021
|
-
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row,
|
12022
|
-
(void)hist;
|
11363
|
+
size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12023
11364
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
12024
11365
|
int nblock = n_per_row/QK_K;
|
12025
11366
|
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
@@ -12049,7 +11390,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
|
12049
11390
|
|
12050
11391
|
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
12051
11392
|
assert(k % QK_K == 0);
|
12052
|
-
quantize_iq3_s(x, y, 1, k, NULL
|
11393
|
+
quantize_iq3_s(x, y, 1, k, NULL);
|
12053
11394
|
}
|
12054
11395
|
|
12055
11396
|
|
@@ -12115,12 +11456,70 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
12115
11456
|
return grid_index;
|
12116
11457
|
}
|
12117
11458
|
|
11459
|
+
// Weighted squared distance between the 8 values in `xval` and the grid point `pg`,
// where each grid entry is mapped through `xg` and multiplied by `scale`.
static inline float iq1_weighted_dist2(const int8_t * restrict pg, const float * restrict xval,
        const float * restrict weight, float scale, const float * restrict xg) {
    float d2 = 0;
    for (int i = 0; i < 8; ++i) {
        float q = xg[(pg[i] - 1)/2];
        float w = weight[i];
        float diff = scale*q - xval[i];
        d2 += w*diff*diff;
    }
    return d2;
}

// Picks the grid point that minimizes the weighted squared error
//     sum_i weight[i] * (scale * xg[(g[i] - 1)/2] - xval[i])^2
// for one group of 8 values.
//
//   neighbours - neighbours[0] holds the candidate count (asserted > 0), followed by
//                that many indices into `grid`
//   grid       - grid points, each uint64_t read as 8 int8_t entries
//   xval       - the 8 input values being quantized
//   weight     - the 8 per-value weights
//   scale      - scale applied to the mapped grid values
//   xg         - lookup table indexed by (entry - 1)/2
//                (presumably 3 levels, i.e. entries in {1, 3, 5} -- TODO confirm against the grid tables)
//   L          - output: receives (entry - 1)/2 for each of the 8 entries of the chosen grid point
//   ngrid      - number of entries in `grid`, used only by the exhaustive fallback
//
// Returns the index of the chosen grid point; asserts that one was found.
static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);
    float best_score = FLT_MAX;
    int grid_index = -1;
    // First pass: only the precomputed neighbour candidates.
    for (int j = 1; j <= num_neighbors; ++j) {
        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
        float d2 = iq1_weighted_dist2(pg, xval, weight, scale, xg);
        if (d2 < best_score) {
            best_score = d2;
            grid_index = neighbours[j];
        }
    }
    // Fallback: no neighbour scored below FLT_MAX, so scan the whole grid.
    // The distance must be taken against xval[0..7] (the component index), not against
    // the grid index: indexing xval by the grid index reads far outside the 8-value slice.
    if (grid_index < 0) {
        for (int i = 0; i < ngrid; ++i) {
            const int8_t * grid_i = (const int8_t *)(grid + i);
            float d2 = iq1_weighted_dist2(grid_i, xval, weight, scale, xg);
            if (d2 < best_score) {
                best_score = d2;
                grid_index = i;
            }
        }
    }
    // Still nothing: dump diagnostics for every neighbour before the assert below fires.
    if (grid_index < 0) {
        printf("Oops, did not find grid point\n");
        printf("Have %d neighbours\n", num_neighbors);
        for (int j = 1; j <= num_neighbors; ++j) {
            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
            float sumqx = 0, sumq2 = 0;
            for (int i = 0; i < 8; ++i) {
                float q = xg[(pg[i] - 1)/2];
                float w = weight[i];
                sumqx += w*q*xval[i];
                sumq2 += w*q*q;
            }
            printf("    neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
        }
    }
    GGML_ASSERT(grid_index >= 0);
    const int8_t * pg = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
    return grid_index;
}
|
11515
|
+
|
12118
11516
|
// qsort() comparator giving ascending order on the float that each element starts with.
// Returns -1, 1 or 0; any comparison involving NaN yields 0.
static int iq1_sort_helper(const void * left, const void * right) {
    const float lhs = *(const float *)left;
    const float rhs = *(const float *)right;
    if (lhs < rhs) {
        return -1;
    }
    if (lhs > rhs) {
        return 1;
    }
    return 0;
}
|
12123
11521
|
|
11522
|
+
#define IQ1S_BLOCK_SIZE 32
|
12124
11523
|
static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
12125
11524
|
|
12126
11525
|
const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
|
@@ -12139,37 +11538,41 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
12139
11538
|
|
12140
11539
|
block_iq1_s * y = vy;
|
12141
11540
|
|
12142
|
-
float
|
12143
|
-
float
|
12144
|
-
|
12145
|
-
float
|
12146
|
-
float
|
12147
|
-
|
11541
|
+
const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
|
11542
|
+
const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
|
11543
|
+
|
11544
|
+
float scales[QK_K/IQ1S_BLOCK_SIZE];
|
11545
|
+
float weight[IQ1S_BLOCK_SIZE];
|
11546
|
+
int8_t L[IQ1S_BLOCK_SIZE];
|
11547
|
+
float sumx[IQ1S_BLOCK_SIZE+1];
|
11548
|
+
float sumw[IQ1S_BLOCK_SIZE+1];
|
11549
|
+
float pairs[2*IQ1S_BLOCK_SIZE];
|
12148
11550
|
int * idx = (int *)(pairs + 1);
|
12149
|
-
|
11551
|
+
uint16_t index[IQ1S_BLOCK_SIZE/8];
|
11552
|
+
int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
|
12150
11553
|
|
12151
11554
|
for (int ibl = 0; ibl < nbl; ++ibl) {
|
12152
11555
|
|
12153
11556
|
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
12154
11557
|
memset(y[ibl].qs, 0, QK_K/8);
|
12155
|
-
memset(y[ibl].
|
11558
|
+
memset(y[ibl].qh, 0, QK_K/16);
|
12156
11559
|
|
12157
11560
|
float max_scale = 0;
|
12158
11561
|
|
12159
11562
|
const float * xbl = x + QK_K*ibl;
|
12160
11563
|
float sumx2 = 0;
|
12161
11564
|
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
12162
|
-
float sigma2 = sumx2/QK_K;
|
11565
|
+
float sigma2 = 2*sumx2/QK_K;
|
12163
11566
|
|
12164
|
-
for (int ib = 0; ib < QK_K/
|
12165
|
-
const float * xb = xbl +
|
12166
|
-
const float * qw = quant_weights + QK_K*ibl +
|
12167
|
-
for (int i = 0; i <
|
11567
|
+
for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
|
11568
|
+
const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
|
11569
|
+
const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
|
11570
|
+
for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
12168
11571
|
float max = fabsf(xb[0]);
|
12169
|
-
for (int i = 1; i <
|
11572
|
+
for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
|
12170
11573
|
if (!max) {
|
12171
11574
|
scales[ib] = 0;
|
12172
|
-
memset(L, 1,
|
11575
|
+
memset(L, 1, IQ1S_BLOCK_SIZE);
|
12173
11576
|
continue;
|
12174
11577
|
}
|
12175
11578
|
// Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
|
@@ -12178,52 +11581,81 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
12178
11581
|
// in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
|
12179
11582
|
// Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
|
12180
11583
|
// and score for each possible split.
|
12181
|
-
for (int j = 0; j <
|
11584
|
+
for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
|
12182
11585
|
pairs[2*j] = xb[j];
|
12183
11586
|
idx[2*j] = j;
|
12184
11587
|
}
|
12185
|
-
qsort(pairs,
|
11588
|
+
qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
|
12186
11589
|
{
|
12187
11590
|
sumx[0] = sumw[0] = 0;
|
12188
|
-
for (int j = 0; j <
|
11591
|
+
for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
|
12189
11592
|
int i = idx[2*j];
|
12190
11593
|
sumx[j+1] = sumx[j] + weight[i]*xb[i];
|
12191
11594
|
sumw[j+1] = sumw[j] + weight[i];
|
12192
11595
|
}
|
12193
11596
|
}
|
12194
11597
|
float best_score = 0, scale = max;
|
12195
|
-
int besti1 =
|
12196
|
-
for (int i1 = 0; i1 <=
|
12197
|
-
for (int i2 = i1; i2 <=
|
12198
|
-
float sumqx =
|
12199
|
-
float sumq2 =
|
11598
|
+
int besti1 = -1, besti2 = -1, best_shift = 0;
|
11599
|
+
for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
|
11600
|
+
for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
|
11601
|
+
float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
|
11602
|
+
float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
|
12200
11603
|
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
12201
11604
|
scale = sumqx/sumq2; best_score = scale*sumqx;
|
12202
|
-
besti1 = i1; besti2 = i2;
|
11605
|
+
besti1 = i1; besti2 = i2; best_shift = 1;
|
11606
|
+
}
|
11607
|
+
sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
|
11608
|
+
sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
|
11609
|
+
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
11610
|
+
scale = sumqx/sumq2; best_score = scale*sumqx;
|
11611
|
+
besti1 = i1; besti2 = i2; best_shift = -1;
|
12203
11612
|
}
|
12204
11613
|
}
|
12205
11614
|
}
|
11615
|
+
GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
|
12206
11616
|
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
|
12207
11617
|
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
|
12208
|
-
for (int j = besti2; j <
|
11618
|
+
for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
|
12209
11619
|
if (scale < 0) {
|
12210
|
-
for (int j = 0; j <
|
12211
|
-
scale = -scale;
|
11620
|
+
for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
|
11621
|
+
scale = -scale; best_shift = -best_shift;
|
11622
|
+
}
|
11623
|
+
bool all_on_grid = true;
|
11624
|
+
const float * xx = best_shift == 1 ? x_p : x_m;
|
11625
|
+
for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
|
11626
|
+
uint16_t u = 0;
|
11627
|
+
for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
|
11628
|
+
int grid_index = kmap_q2xs[u];
|
11629
|
+
if (grid_index < 0) {
|
11630
|
+
all_on_grid = false;
|
11631
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
11632
|
+
grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
|
11633
|
+
GGML_ASSERT(grid_index >= 0);
|
11634
|
+
}
|
11635
|
+
index[k] = grid_index;
|
11636
|
+
}
|
11637
|
+
if (!all_on_grid) {
|
11638
|
+
float sumqx = 0, sumq2 = 0;
|
11639
|
+
for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
|
11640
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
|
11641
|
+
for (int j = 0; j < 8; ++j) {
|
11642
|
+
float w = weight[8*k + j];
|
11643
|
+
float q = xx[(pg[j] - 1)/2];
|
11644
|
+
sumqx += w*q*xb[8*k+j];
|
11645
|
+
sumq2 += w*q*q;
|
11646
|
+
}
|
11647
|
+
}
|
11648
|
+
if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
|
11649
|
+
}
|
11650
|
+
uint16_t h = 0;
|
11651
|
+
for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
|
11652
|
+
y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255;
|
11653
|
+
h |= (index[k] >> 8) << 3*k;
|
12212
11654
|
}
|
12213
|
-
|
12214
|
-
// grid point that minimizes SSD.
|
12215
|
-
uint16_t u = 0;
|
12216
|
-
for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
|
12217
|
-
int grid_index = kmap_q2xs[u];
|
12218
|
-
if (grid_index < 0) {
|
12219
|
-
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
12220
|
-
grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
|
12221
|
-
GGML_ASSERT(grid_index >= 0);
|
12222
|
-
}
|
12223
|
-
y[ibl].qs[ib] = grid_index & 255;
|
12224
|
-
hbit[ib] = grid_index >> 8;
|
11655
|
+
y[ibl].qh[ib] = h;
|
12225
11656
|
GGML_ASSERT(scale >= 0);
|
12226
11657
|
scales[ib] = scale;
|
11658
|
+
shifts[ib] = best_shift;
|
12227
11659
|
max_scale = MAX(max_scale, scale);
|
12228
11660
|
}
|
12229
11661
|
|
@@ -12233,19 +11665,18 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
12233
11665
|
}
|
12234
11666
|
|
12235
11667
|
float d = max_scale/15;
|
12236
|
-
y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
|
11668
|
+
y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
|
12237
11669
|
float id = 1/d;
|
12238
|
-
for (int ib = 0; ib < QK_K/8; ++ib) {
|
11670
|
+
for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
|
12239
11671
|
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
12240
11672
|
l = MAX(0, MIN(7, l));
|
12241
|
-
if (hbit[ib]) l |= 8;
|
12242
|
-
y[ibl].
|
11673
|
+
if (shifts[ib] == -1) l |= 8;
|
11674
|
+
y[ibl].qh[ib] |= (l << 12);
|
12243
11675
|
}
|
12244
11676
|
}
|
12245
11677
|
}
|
12246
11678
|
|
12247
|
-
size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
12248
|
-
(void)hist;
|
11679
|
+
size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12249
11680
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
12250
11681
|
int nblock = n_per_row/QK_K;
|
12251
11682
|
char * qrow = (char *)dst;
|
@@ -12270,7 +11701,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|
12270
11701
|
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
12271
11702
|
}
|
12272
11703
|
|
12273
|
-
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * x,
|
11704
|
+
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
|
12274
11705
|
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
12275
11706
|
float * scales, float * weight, uint8_t * L,
|
12276
11707
|
const int8_t * values,
|
@@ -12378,8 +11809,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|
12378
11809
|
}
|
12379
11810
|
}
|
12380
11811
|
|
12381
|
-
size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
12382
|
-
(void)hist;
|
11812
|
+
size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12383
11813
|
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
12384
11814
|
int nblock = n_per_row/QK4_NL;
|
12385
11815
|
char * qrow = (char *)dst;
|
@@ -12409,14 +11839,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
|
|
12409
11839
|
|
12410
11840
|
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
|
12411
11841
|
assert(k % QK4_NL == 0);
|
12412
|
-
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
11842
|
+
quantize_iq4_nl(x, y, 1, k, NULL);
|
12413
11843
|
}
|
12414
11844
|
|
12415
|
-
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
11845
|
+
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12416
11846
|
#if QK_K == 64
|
12417
|
-
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
11847
|
+
return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
|
12418
11848
|
#else
|
12419
|
-
(void)hist;
|
12420
11849
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
12421
11850
|
int nblock = n_per_row/QK_K;
|
12422
11851
|
char * qrow = (char *)dst;
|
@@ -12445,7 +11874,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
|
12445
11874
|
|
12446
11875
|
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
|
12447
11876
|
assert(k % QK_K == 0);
|
12448
|
-
quantize_iq4_xs(x, y, 1, k, NULL, NULL);
|
11877
|
+
quantize_iq4_xs(x, y, 1, k, NULL);
|
12449
11878
|
}
|
12450
11879
|
|
12451
11880
|
// =============================== 2.5625 bpw
|
@@ -12618,8 +12047,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
12618
12047
|
}
|
12619
12048
|
}
|
12620
12049
|
|
12621
|
-
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
12622
|
-
(void)hist;
|
12050
|
+
size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
12623
12051
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
12624
12052
|
int nblock = n_per_row/QK_K;
|
12625
12053
|
char * qrow = (char *)dst;
|
@@ -12633,7 +12061,7 @@ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, in
|
|
12633
12061
|
|
12634
12062
|
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
|
12635
12063
|
assert(k % QK_K == 0);
|
12636
|
-
quantize_iq2_s(x, y, 1, k, NULL, NULL);
|
12064
|
+
quantize_iq2_s(x, y, 1, k, NULL);
|
12637
12065
|
}
|
12638
12066
|
|
12639
12067
|
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|