llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +27 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +14 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +81 -20
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
- data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +141 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -12
- data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
- data/vendor/tmp/llama.cpp/llama.h +145 -29
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
|
|
132
132
|
}
|
133
133
|
|
134
134
|
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
135
|
-
#if __AVXVNNI__
|
135
|
+
#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
|
136
136
|
const __m256i zero = _mm256_setzero_si256();
|
137
137
|
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
|
138
138
|
return _mm256_cvtepi32_ps(summed_pairs);
|
@@ -544,7 +544,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
|
|
544
544
|
#endif
|
545
545
|
|
546
546
|
// reference implementation for deterministic creation of model files
|
547
|
-
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y,
|
547
|
+
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
|
548
548
|
static const int qk = QK4_0;
|
549
549
|
|
550
550
|
assert(k % qk == 0);
|
@@ -581,12 +581,12 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
|
|
581
581
|
}
|
582
582
|
}
|
583
583
|
|
584
|
-
void quantize_row_q4_0(const float * restrict x, void * restrict y,
|
584
|
+
void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
|
585
585
|
quantize_row_q4_0_reference(x, y, k);
|
586
586
|
}
|
587
587
|
|
588
588
|
|
589
|
-
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y,
|
589
|
+
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
|
590
590
|
const int qk = QK4_1;
|
591
591
|
|
592
592
|
assert(k % qk == 0);
|
@@ -623,11 +623,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
|
|
623
623
|
}
|
624
624
|
}
|
625
625
|
|
626
|
-
void quantize_row_q4_1(const float * restrict x, void * restrict y,
|
626
|
+
void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
|
627
627
|
quantize_row_q4_1_reference(x, y, k);
|
628
628
|
}
|
629
629
|
|
630
|
-
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y,
|
630
|
+
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
|
631
631
|
static const int qk = QK5_0;
|
632
632
|
|
633
633
|
assert(k % qk == 0);
|
@@ -671,11 +671,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
|
|
671
671
|
}
|
672
672
|
}
|
673
673
|
|
674
|
-
void quantize_row_q5_0(const float * restrict x, void * restrict y,
|
674
|
+
void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
|
675
675
|
quantize_row_q5_0_reference(x, y, k);
|
676
676
|
}
|
677
677
|
|
678
|
-
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y,
|
678
|
+
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
|
679
679
|
const int qk = QK5_1;
|
680
680
|
|
681
681
|
assert(k % qk == 0);
|
@@ -719,12 +719,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
|
|
719
719
|
}
|
720
720
|
}
|
721
721
|
|
722
|
-
void quantize_row_q5_1(const float * restrict x, void * restrict y,
|
722
|
+
void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
|
723
723
|
quantize_row_q5_1_reference(x, y, k);
|
724
724
|
}
|
725
725
|
|
726
726
|
// reference implementation for deterministic creation of model files
|
727
|
-
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y,
|
727
|
+
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
|
728
728
|
assert(k % QK8_0 == 0);
|
729
729
|
const int nb = k / QK8_0;
|
730
730
|
|
@@ -749,7 +749,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict
|
|
749
749
|
}
|
750
750
|
}
|
751
751
|
|
752
|
-
void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
|
752
|
+
void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
|
753
753
|
assert(QK8_0 == 32);
|
754
754
|
assert(k % QK8_0 == 0);
|
755
755
|
const int nb = k / QK8_0;
|
@@ -938,7 +938,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
|
|
938
938
|
}
|
939
939
|
|
940
940
|
// reference implementation for deterministic creation of model files
|
941
|
-
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y,
|
941
|
+
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
|
942
942
|
assert(QK8_1 == 32);
|
943
943
|
assert(k % QK8_1 == 0);
|
944
944
|
const int nb = k / QK8_1;
|
@@ -973,7 +973,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
|
|
973
973
|
}
|
974
974
|
}
|
975
975
|
|
976
|
-
void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
976
|
+
void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
|
977
977
|
assert(k % QK8_1 == 0);
|
978
978
|
const int nb = k / QK8_1;
|
979
979
|
|
@@ -1192,7 +1192,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
1192
1192
|
#endif
|
1193
1193
|
}
|
1194
1194
|
|
1195
|
-
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
|
1195
|
+
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
|
1196
1196
|
static const int qk = QK4_0;
|
1197
1197
|
|
1198
1198
|
assert(k % qk == 0);
|
@@ -1212,7 +1212,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
|
|
1212
1212
|
}
|
1213
1213
|
}
|
1214
1214
|
|
1215
|
-
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
|
1215
|
+
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
|
1216
1216
|
static const int qk = QK4_1;
|
1217
1217
|
|
1218
1218
|
assert(k % qk == 0);
|
@@ -1233,7 +1233,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
|
|
1233
1233
|
}
|
1234
1234
|
}
|
1235
1235
|
|
1236
|
-
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
|
1236
|
+
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
|
1237
1237
|
static const int qk = QK5_0;
|
1238
1238
|
|
1239
1239
|
assert(k % qk == 0);
|
@@ -1259,7 +1259,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int
|
|
1259
1259
|
}
|
1260
1260
|
}
|
1261
1261
|
|
1262
|
-
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
|
1262
|
+
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
|
1263
1263
|
static const int qk = QK5_1;
|
1264
1264
|
|
1265
1265
|
assert(k % qk == 0);
|
@@ -1286,7 +1286,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int
|
|
1286
1286
|
}
|
1287
1287
|
}
|
1288
1288
|
|
1289
|
-
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y,
|
1289
|
+
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
|
1290
1290
|
static const int qk = QK8_0;
|
1291
1291
|
|
1292
1292
|
assert(k % qk == 0);
|
@@ -1581,7 +1581,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
|
1581
1581
|
|
1582
1582
|
//========================- 2-bit (de)-quantization
|
1583
1583
|
|
1584
|
-
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y,
|
1584
|
+
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
|
1585
1585
|
assert(k % QK_K == 0);
|
1586
1586
|
const int nb = k / QK_K;
|
1587
1587
|
|
@@ -1658,7 +1658,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
|
|
1658
1658
|
}
|
1659
1659
|
}
|
1660
1660
|
|
1661
|
-
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
|
1661
|
+
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
|
1662
1662
|
assert(k % QK_K == 0);
|
1663
1663
|
const int nb = k / QK_K;
|
1664
1664
|
|
@@ -1704,7 +1704,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
|
|
1704
1704
|
}
|
1705
1705
|
}
|
1706
1706
|
|
1707
|
-
void quantize_row_q2_K(const float * restrict x, void * restrict vy,
|
1707
|
+
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
|
1708
1708
|
quantize_row_q2_K_reference(x, vy, k);
|
1709
1709
|
}
|
1710
1710
|
|
@@ -1960,14 +1960,14 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
1960
1960
|
}
|
1961
1961
|
}
|
1962
1962
|
|
1963
|
-
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
1963
|
+
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
1964
1964
|
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
1965
1965
|
if (!quant_weights) {
|
1966
|
-
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
1966
|
+
quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
1967
1967
|
}
|
1968
1968
|
else {
|
1969
1969
|
char * qrow = (char *)dst;
|
1970
|
-
for (
|
1970
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
1971
1971
|
quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
|
1972
1972
|
src += n_per_row;
|
1973
1973
|
qrow += row_size;
|
@@ -1978,7 +1978,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow,
|
|
1978
1978
|
|
1979
1979
|
//========================= 3-bit (de)-quantization
|
1980
1980
|
|
1981
|
-
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y,
|
1981
|
+
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
|
1982
1982
|
assert(k % QK_K == 0);
|
1983
1983
|
const int nb = k / QK_K;
|
1984
1984
|
|
@@ -2092,7 +2092,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
|
|
2092
2092
|
}
|
2093
2093
|
|
2094
2094
|
#if QK_K == 256
|
2095
|
-
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
|
2095
|
+
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
|
2096
2096
|
assert(k % QK_K == 0);
|
2097
2097
|
const int nb = k / QK_K;
|
2098
2098
|
|
@@ -2142,7 +2142,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
|
|
2142
2142
|
}
|
2143
2143
|
}
|
2144
2144
|
#else
|
2145
|
-
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
|
2145
|
+
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
|
2146
2146
|
assert(k % QK_K == 0);
|
2147
2147
|
assert(QK_K == 64);
|
2148
2148
|
const int nb = k / QK_K;
|
@@ -2175,11 +2175,11 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
|
|
2175
2175
|
}
|
2176
2176
|
#endif
|
2177
2177
|
|
2178
|
-
void quantize_row_q3_K(const float * restrict x, void * restrict vy,
|
2178
|
+
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
|
2179
2179
|
quantize_row_q3_K_reference(x, vy, k);
|
2180
2180
|
}
|
2181
2181
|
|
2182
|
-
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y,
|
2182
|
+
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
|
2183
2183
|
#if QK_K != 256
|
2184
2184
|
(void)quant_weights;
|
2185
2185
|
quantize_row_q3_K_reference(x, y, n_per_row);
|
@@ -2268,14 +2268,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|
2268
2268
|
#endif
|
2269
2269
|
}
|
2270
2270
|
|
2271
|
-
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
2271
|
+
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
2272
2272
|
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
2273
2273
|
if (!quant_weights) {
|
2274
|
-
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
2274
|
+
quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
2275
2275
|
}
|
2276
2276
|
else {
|
2277
2277
|
char * qrow = (char *)dst;
|
2278
|
-
for (
|
2278
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
2279
2279
|
quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
|
2280
2280
|
src += n_per_row;
|
2281
2281
|
qrow += row_size;
|
@@ -2286,7 +2286,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow,
|
|
2286
2286
|
|
2287
2287
|
// ====================== 4-bit (de)-quantization
|
2288
2288
|
|
2289
|
-
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y,
|
2289
|
+
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
|
2290
2290
|
assert(k % QK_K == 0);
|
2291
2291
|
const int nb = k / QK_K;
|
2292
2292
|
|
@@ -2393,7 +2393,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
|
2393
2393
|
}
|
2394
2394
|
}
|
2395
2395
|
|
2396
|
-
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
|
2396
|
+
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
|
2397
2397
|
assert(k % QK_K == 0);
|
2398
2398
|
const int nb = k / QK_K;
|
2399
2399
|
|
@@ -2432,19 +2432,19 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
|
|
2432
2432
|
}
|
2433
2433
|
}
|
2434
2434
|
|
2435
|
-
void quantize_row_q4_K(const float * restrict x, void * restrict vy,
|
2435
|
+
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
|
2436
2436
|
assert(k % QK_K == 0);
|
2437
2437
|
block_q4_K * restrict y = vy;
|
2438
2438
|
quantize_row_q4_K_reference(x, y, k);
|
2439
2439
|
}
|
2440
2440
|
|
2441
|
-
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y,
|
2441
|
+
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
2442
2442
|
#if QK_K != 256
|
2443
2443
|
(void)quant_weights;
|
2444
2444
|
quantize_row_q4_K_reference(x, y, n_per_row);
|
2445
2445
|
#else
|
2446
2446
|
assert(n_per_row % QK_K == 0);
|
2447
|
-
const
|
2447
|
+
const int64_t nb = n_per_row / QK_K;
|
2448
2448
|
|
2449
2449
|
uint8_t L[QK_K];
|
2450
2450
|
uint8_t Laux[32];
|
@@ -2516,14 +2516,14 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
2516
2516
|
#endif
|
2517
2517
|
}
|
2518
2518
|
|
2519
|
-
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
2519
|
+
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
2520
2520
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
2521
2521
|
if (!quant_weights) {
|
2522
|
-
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
2522
|
+
quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
2523
2523
|
}
|
2524
2524
|
else {
|
2525
2525
|
char * qrow = (char *)dst;
|
2526
|
-
for (
|
2526
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
2527
2527
|
quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
|
2528
2528
|
src += n_per_row;
|
2529
2529
|
qrow += row_size;
|
@@ -2534,9 +2534,9 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow,
|
|
2534
2534
|
|
2535
2535
|
// ====================== 5-bit (de)-quantization
|
2536
2536
|
|
2537
|
-
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y,
|
2537
|
+
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
|
2538
2538
|
assert(k % QK_K == 0);
|
2539
|
-
const
|
2539
|
+
const int64_t nb = k / QK_K;
|
2540
2540
|
|
2541
2541
|
#if QK_K == 256
|
2542
2542
|
uint8_t L[QK_K];
|
@@ -2676,9 +2676,9 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
2676
2676
|
}
|
2677
2677
|
}
|
2678
2678
|
|
2679
|
-
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
|
2679
|
+
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
|
2680
2680
|
assert(k % QK_K == 0);
|
2681
|
-
const
|
2681
|
+
const int64_t nb = k / QK_K;
|
2682
2682
|
|
2683
2683
|
for (int i = 0; i < nb; i++) {
|
2684
2684
|
|
@@ -2721,19 +2721,19 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
|
|
2721
2721
|
}
|
2722
2722
|
}
|
2723
2723
|
|
2724
|
-
void quantize_row_q5_K(const float * restrict x, void * restrict vy,
|
2724
|
+
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
|
2725
2725
|
assert(k % QK_K == 0);
|
2726
2726
|
block_q5_K * restrict y = vy;
|
2727
2727
|
quantize_row_q5_K_reference(x, y, k);
|
2728
2728
|
}
|
2729
2729
|
|
2730
|
-
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y,
|
2730
|
+
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
2731
2731
|
#if QK_K != 256
|
2732
2732
|
(void)quant_weights;
|
2733
2733
|
quantize_row_q5_K_reference(x, y, n_per_row);
|
2734
2734
|
#else
|
2735
2735
|
assert(n_per_row % QK_K == 0);
|
2736
|
-
const
|
2736
|
+
const int64_t nb = n_per_row / QK_K;
|
2737
2737
|
|
2738
2738
|
uint8_t L[QK_K];
|
2739
2739
|
uint8_t Laux[32];
|
@@ -2825,14 +2825,14 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
2825
2825
|
#endif
|
2826
2826
|
}
|
2827
2827
|
|
2828
|
-
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
2828
|
+
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
2829
2829
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
2830
2830
|
if (!quant_weights) {
|
2831
|
-
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
2831
|
+
quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
2832
2832
|
}
|
2833
2833
|
else {
|
2834
2834
|
char * qrow = (char *)dst;
|
2835
|
-
for (
|
2835
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
2836
2836
|
quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
|
2837
2837
|
src += n_per_row;
|
2838
2838
|
qrow += row_size;
|
@@ -2843,9 +2843,9 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow,
|
|
2843
2843
|
|
2844
2844
|
// ====================== 6-bit (de)-quantization
|
2845
2845
|
|
2846
|
-
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y,
|
2846
|
+
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
|
2847
2847
|
assert(k % QK_K == 0);
|
2848
|
-
const
|
2848
|
+
const int64_t nb = k / QK_K;
|
2849
2849
|
|
2850
2850
|
int8_t L[QK_K];
|
2851
2851
|
float scales[QK_K/16];
|
@@ -2925,9 +2925,9 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|
2925
2925
|
}
|
2926
2926
|
}
|
2927
2927
|
|
2928
|
-
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
|
2928
|
+
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
|
2929
2929
|
assert(k % QK_K == 0);
|
2930
|
-
const
|
2930
|
+
const int64_t nb = k / QK_K;
|
2931
2931
|
|
2932
2932
|
for (int i = 0; i < nb; i++) {
|
2933
2933
|
|
@@ -2972,19 +2972,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
|
|
2972
2972
|
}
|
2973
2973
|
}
|
2974
2974
|
|
2975
|
-
void quantize_row_q6_K(const float * restrict x, void * restrict vy,
|
2975
|
+
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
|
2976
2976
|
assert(k % QK_K == 0);
|
2977
2977
|
block_q6_K * restrict y = vy;
|
2978
2978
|
quantize_row_q6_K_reference(x, y, k);
|
2979
2979
|
}
|
2980
2980
|
|
2981
|
-
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y,
|
2981
|
+
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
2982
2982
|
#if QK_K != 256
|
2983
2983
|
(void)quant_weights;
|
2984
2984
|
quantize_row_q6_K_reference(x, y, n_per_row);
|
2985
2985
|
#else
|
2986
2986
|
assert(n_per_row % QK_K == 0);
|
2987
|
-
const
|
2987
|
+
const int64_t nb = n_per_row / QK_K;
|
2988
2988
|
|
2989
2989
|
int8_t L[QK_K];
|
2990
2990
|
float scales[QK_K/16];
|
@@ -3067,14 +3067,14 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
3067
3067
|
#endif
|
3068
3068
|
}
|
3069
3069
|
|
3070
|
-
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3070
|
+
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
3071
3071
|
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
3072
3072
|
if (!quant_weights) {
|
3073
|
-
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
3073
|
+
quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
3074
3074
|
}
|
3075
3075
|
else {
|
3076
3076
|
char * qrow = (char *)dst;
|
3077
|
-
for (
|
3077
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
3078
3078
|
quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
|
3079
3079
|
src += n_per_row;
|
3080
3080
|
qrow += row_size;
|
@@ -3083,7 +3083,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow,
|
|
3083
3083
|
return nrow * row_size;
|
3084
3084
|
}
|
3085
3085
|
|
3086
|
-
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y,
|
3086
|
+
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
3087
3087
|
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
3088
3088
|
|
3089
3089
|
if (!quant_weights) {
|
@@ -3098,7 +3098,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
3098
3098
|
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
3099
3099
|
float sigma2 = sum_x2/n_per_row;
|
3100
3100
|
|
3101
|
-
const
|
3101
|
+
const int64_t nb = n_per_row/QK4_0;
|
3102
3102
|
for (int ib = 0; ib < nb; ++ib) {
|
3103
3103
|
const float * xb = x + QK4_0 * ib;
|
3104
3104
|
const float * qw = quant_weights + QK4_0 * ib;
|
@@ -3111,14 +3111,14 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
3111
3111
|
}
|
3112
3112
|
}
|
3113
3113
|
|
3114
|
-
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3114
|
+
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
3115
3115
|
if (!quant_weights) {
|
3116
|
-
quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
|
3116
|
+
quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
3117
3117
|
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
3118
3118
|
}
|
3119
3119
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
3120
3120
|
char * qrow = (char *)dst;
|
3121
|
-
for (
|
3121
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
3122
3122
|
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
3123
3123
|
src += n_per_row;
|
3124
3124
|
qrow += row_size;
|
@@ -3126,7 +3126,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow,
|
|
3126
3126
|
return nrow * row_size;
|
3127
3127
|
}
|
3128
3128
|
|
3129
|
-
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y,
|
3129
|
+
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
3130
3130
|
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
3131
3131
|
|
3132
3132
|
if (!quant_weights) {
|
@@ -3141,7 +3141,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
3141
3141
|
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
3142
3142
|
float sigma2 = sum_x2/n_per_row;
|
3143
3143
|
|
3144
|
-
const
|
3144
|
+
const int64_t nb = n_per_row/QK4_1;
|
3145
3145
|
for (int ib = 0; ib < nb; ++ib) {
|
3146
3146
|
const float * xb = x + QK4_1 * ib;
|
3147
3147
|
const float * qw = quant_weights + QK4_1 * ib;
|
@@ -3156,14 +3156,14 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
3156
3156
|
}
|
3157
3157
|
}
|
3158
3158
|
|
3159
|
-
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3159
|
+
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
3160
3160
|
if (!quant_weights) {
|
3161
|
-
quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
|
3161
|
+
quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
|
3162
3162
|
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
3163
3163
|
}
|
3164
3164
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
3165
3165
|
char * qrow = (char *)dst;
|
3166
|
-
for (
|
3166
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
3167
3167
|
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
3168
3168
|
src += n_per_row;
|
3169
3169
|
qrow += row_size;
|
@@ -3171,7 +3171,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow,
|
|
3171
3171
|
return nrow * row_size;
|
3172
3172
|
}
|
3173
3173
|
|
3174
|
-
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y,
|
3174
|
+
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
3175
3175
|
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
3176
3176
|
|
3177
3177
|
if (!quant_weights) {
|
@@ -3186,7 +3186,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
3186
3186
|
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
3187
3187
|
float sigma2 = sum_x2/n_per_row;
|
3188
3188
|
|
3189
|
-
const
|
3189
|
+
const int64_t nb = n_per_row/QK5_0;
|
3190
3190
|
for (int ib = 0; ib < nb; ++ib) {
|
3191
3191
|
const float * xb = x + QK5_0 * ib;
|
3192
3192
|
const float * qw = quant_weights + QK5_0 * ib;
|
@@ -3210,14 +3210,14 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
3210
3210
|
}
|
3211
3211
|
}
|
3212
3212
|
|
3213
|
-
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3213
|
+
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
3214
3214
|
if (!quant_weights) {
|
3215
|
-
quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
|
3215
|
+
quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
3216
3216
|
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
3217
3217
|
}
|
3218
3218
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
3219
3219
|
char * qrow = (char *)dst;
|
3220
|
-
for (
|
3220
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
3221
3221
|
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
3222
3222
|
src += n_per_row;
|
3223
3223
|
qrow += row_size;
|
@@ -3225,7 +3225,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow,
|
|
3225
3225
|
return nrow * row_size;
|
3226
3226
|
}
|
3227
3227
|
|
3228
|
-
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y,
|
3228
|
+
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
|
3229
3229
|
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
3230
3230
|
|
3231
3231
|
if (!quant_weights) {
|
@@ -3240,7 +3240,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
3240
3240
|
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
3241
3241
|
float sigma2 = sum_x2/n_per_row;
|
3242
3242
|
|
3243
|
-
const
|
3243
|
+
const int64_t nb = n_per_row/QK5_1;
|
3244
3244
|
for (int ib = 0; ib < nb; ++ib) {
|
3245
3245
|
const float * xb = x + QK5_1 * ib;
|
3246
3246
|
const float * qw = quant_weights + QK5_1 * ib;
|
@@ -3263,14 +3263,14 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
3263
3263
|
}
|
3264
3264
|
}
|
3265
3265
|
|
3266
|
-
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
3266
|
+
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
3267
3267
|
if (!quant_weights) {
|
3268
|
-
quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
|
3268
|
+
quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
|
3269
3269
|
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
3270
3270
|
}
|
3271
3271
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
3272
3272
|
char * qrow = (char *)dst;
|
3273
|
-
for (
|
3273
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
3274
3274
|
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
3275
3275
|
src += n_per_row;
|
3276
3276
|
qrow += row_size;
|
@@ -3278,18 +3278,18 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow,
|
|
3278
3278
|
return nrow * row_size;
|
3279
3279
|
}
|
3280
3280
|
|
3281
|
-
size_t quantize_q8_0(const float * restrict src, void * restrict dst,
|
3281
|
+
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
3282
3282
|
(void)quant_weights; // not used
|
3283
3283
|
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
3284
|
-
quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
|
3284
|
+
quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
3285
3285
|
return nrow * row_size;
|
3286
3286
|
}
|
3287
3287
|
|
3288
3288
|
// ====================== "True" 2-bit (de)-quantization
|
3289
3289
|
|
3290
|
-
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
3290
|
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
|
3291
3291
|
assert(k % QK_K == 0);
|
3292
|
-
const int nb = k / QK_K;
|
3292
|
+
const int64_t nb = k / QK_K;
|
3293
3293
|
|
3294
3294
|
uint32_t aux32[2];
|
3295
3295
|
const uint8_t * aux8 = (const uint8_t *)aux32;
|
@@ -3315,9 +3315,9 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
|
|
3315
3315
|
|
3316
3316
|
// ====================== 2.3125 bpw (de)-quantization
|
3317
3317
|
|
3318
|
-
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
|
3318
|
+
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
|
3319
3319
|
assert(k % QK_K == 0);
|
3320
|
-
const int nb = k / QK_K;
|
3320
|
+
const int64_t nb = k / QK_K;
|
3321
3321
|
|
3322
3322
|
float db[2];
|
3323
3323
|
|
@@ -3342,9 +3342,9 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
3342
3342
|
|
3343
3343
|
// ====================== 2.5625 bpw (de)-quantization
|
3344
3344
|
|
3345
|
-
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
|
3345
|
+
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
|
3346
3346
|
assert(k % QK_K == 0);
|
3347
|
-
const int nb = k / QK_K;
|
3347
|
+
const int64_t nb = k / QK_K;
|
3348
3348
|
|
3349
3349
|
float db[2];
|
3350
3350
|
|
@@ -3374,9 +3374,9 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
|
|
3374
3374
|
|
3375
3375
|
// ====================== 3.0625 bpw (de)-quantization
|
3376
3376
|
|
3377
|
-
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
|
3377
|
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
|
3378
3378
|
assert(k % QK_K == 0);
|
3379
|
-
const int nb = k / QK_K;
|
3379
|
+
const int64_t nb = k / QK_K;
|
3380
3380
|
|
3381
3381
|
uint32_t aux32;
|
3382
3382
|
|
@@ -3406,9 +3406,9 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
|
3406
3406
|
|
3407
3407
|
// ====================== 3.3125 bpw (de)-quantization
|
3408
3408
|
|
3409
|
-
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
|
3409
|
+
void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
|
3410
3410
|
assert(k % QK_K == 0);
|
3411
|
-
const int nb = k / QK_K;
|
3411
|
+
const int64_t nb = k / QK_K;
|
3412
3412
|
|
3413
3413
|
for (int i = 0; i < nb; i++) {
|
3414
3414
|
|
@@ -3449,9 +3449,9 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
|
3449
3449
|
|
3450
3450
|
// ====================== 1.5625 bpw (de)-quantization
|
3451
3451
|
|
3452
|
-
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
|
3452
|
+
void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
|
3453
3453
|
assert(k % QK_K == 0);
|
3454
|
-
const int nb = k / QK_K;
|
3454
|
+
const int64_t nb = k / QK_K;
|
3455
3455
|
|
3456
3456
|
for (int i = 0; i < nb; i++) {
|
3457
3457
|
|
@@ -3474,11 +3474,70 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
|
|
3474
3474
|
}
|
3475
3475
|
}
|
3476
3476
|
|
3477
|
+
void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
|
3478
|
+
assert(k % QK_K == 0);
|
3479
|
+
const int64_t nb = k / QK_K;
|
3480
|
+
|
3481
|
+
float delta[4];
|
3482
|
+
uint16_t idx[4];
|
3483
|
+
|
3484
|
+
#if QK_K != 64
|
3485
|
+
iq1m_scale_t scale;
|
3486
|
+
#endif
|
3487
|
+
|
3488
|
+
for (int i = 0; i < nb; i++) {
|
3489
|
+
|
3490
|
+
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
3491
|
+
#if QK_K == 64
|
3492
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
3493
|
+
#else
|
3494
|
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
3495
|
+
const float d = GGML_FP16_TO_FP32(scale.f16);
|
3496
|
+
#endif
|
3497
|
+
const uint8_t * qs = x[i].qs;
|
3498
|
+
const uint8_t * qh = x[i].qh;
|
3499
|
+
|
3500
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
3501
|
+
#if QK_K == 64
|
3502
|
+
const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1);
|
3503
|
+
const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1);
|
3504
|
+
#else
|
3505
|
+
const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
|
3506
|
+
const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
|
3507
|
+
#endif
|
3508
|
+
idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
|
3509
|
+
idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
|
3510
|
+
idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
|
3511
|
+
idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
|
3512
|
+
delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
|
3513
|
+
delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
|
3514
|
+
delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
|
3515
|
+
delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
|
3516
|
+
for (int l = 0; l < 2; ++l) {
|
3517
|
+
const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
|
3518
|
+
for (int j = 0; j < 8; ++j) {
|
3519
|
+
y[j] = dl1 * (grid[j] + delta[l]);
|
3520
|
+
}
|
3521
|
+
y += 8;
|
3522
|
+
}
|
3523
|
+
for (int l = 2; l < 4; ++l) {
|
3524
|
+
const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
|
3525
|
+
for (int j = 0; j < 8; ++j) {
|
3526
|
+
y[j] = dl2 * (grid[j] + delta[l]);
|
3527
|
+
}
|
3528
|
+
y += 8;
|
3529
|
+
}
|
3530
|
+
qs += 4;
|
3531
|
+
qh += 2;
|
3532
|
+
}
|
3533
|
+
}
|
3534
|
+
}
|
3535
|
+
|
3477
3536
|
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
3478
3537
|
|
3479
|
-
void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
|
3538
|
+
void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
|
3480
3539
|
assert(k % QK4_NL == 0);
|
3481
|
-
const int nb = k / QK4_NL;
|
3540
|
+
const int64_t nb = k / QK4_NL;
|
3482
3541
|
|
3483
3542
|
for (int i = 0; i < nb; i++) {
|
3484
3543
|
|
@@ -3494,12 +3553,12 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
|
3494
3553
|
}
|
3495
3554
|
}
|
3496
3555
|
|
3497
|
-
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
3556
|
+
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
|
3498
3557
|
assert(k % QK_K == 0);
|
3499
3558
|
#if QK_K == 64
|
3500
3559
|
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
3501
3560
|
#else
|
3502
|
-
const int nb = k / QK_K;
|
3561
|
+
const int64_t nb = k / QK_K;
|
3503
3562
|
|
3504
3563
|
for (int i = 0; i < nb; i++) {
|
3505
3564
|
|
@@ -3523,9 +3582,9 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
|
3523
3582
|
|
3524
3583
|
//===================================== Q8_K ==============================================
|
3525
3584
|
|
3526
|
-
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
3585
|
+
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
|
3527
3586
|
assert(k % QK_K == 0);
|
3528
|
-
const int nb = k / QK_K;
|
3587
|
+
const int64_t nb = k / QK_K;
|
3529
3588
|
|
3530
3589
|
for (int i = 0; i < nb; i++) {
|
3531
3590
|
|
@@ -3562,9 +3621,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
|
|
3562
3621
|
}
|
3563
3622
|
}
|
3564
3623
|
|
3565
|
-
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
|
3624
|
+
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
|
3566
3625
|
assert(k % QK_K == 0);
|
3567
|
-
const int nb = k / QK_K;
|
3626
|
+
const int64_t nb = k / QK_K;
|
3568
3627
|
|
3569
3628
|
for (int i = 0; i < nb; i++) {
|
3570
3629
|
for (int j = 0; j < QK_K; ++j) {
|
@@ -3573,7 +3632,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int
|
|
3573
3632
|
}
|
3574
3633
|
}
|
3575
3634
|
|
3576
|
-
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
|
3635
|
+
void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
|
3577
3636
|
quantize_row_q8_K_reference(x, y, k);
|
3578
3637
|
}
|
3579
3638
|
|
@@ -9695,6 +9754,248 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
|
9695
9754
|
#endif
|
9696
9755
|
}
|
9697
9756
|
|
9757
|
+
void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
9758
|
+
assert(n % QK_K == 0);
|
9759
|
+
assert(nrc == 1);
|
9760
|
+
UNUSED(nrc);
|
9761
|
+
UNUSED(bx);
|
9762
|
+
UNUSED(by);
|
9763
|
+
UNUSED(bs);
|
9764
|
+
|
9765
|
+
const block_iq1_m * restrict x = vx;
|
9766
|
+
const block_q8_K * restrict y = vy;
|
9767
|
+
|
9768
|
+
const int nb = n / QK_K;
|
9769
|
+
|
9770
|
+
#if QK_K != 64
|
9771
|
+
iq1m_scale_t scale;
|
9772
|
+
#endif
|
9773
|
+
|
9774
|
+
#if defined __ARM_NEON
|
9775
|
+
|
9776
|
+
#if QK_K == 64
|
9777
|
+
const int32x4_t mask = vdupq_n_s32(0xf);
|
9778
|
+
#else
|
9779
|
+
const int32x4_t mask = vdupq_n_s32(0x7);
|
9780
|
+
#endif
|
9781
|
+
const int32x4_t mone = vdupq_n_s32(1);
|
9782
|
+
const int32x4_t mzero = vdupq_n_s32(0);
|
9783
|
+
|
9784
|
+
ggml_int8x16x4_t deltas;
|
9785
|
+
deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
|
9786
|
+
deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
|
9787
|
+
deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
|
9788
|
+
deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
|
9789
|
+
|
9790
|
+
ggml_int8x16x4_t q1b;
|
9791
|
+
ggml_int8x16x4_t q8b;
|
9792
|
+
|
9793
|
+
uint32_t aux32;
|
9794
|
+
const uint8_t * aux8 = (const uint8_t *)&aux32;
|
9795
|
+
|
9796
|
+
float sumf = 0;
|
9797
|
+
for (int i = 0; i < nb; ++i) {
|
9798
|
+
|
9799
|
+
const int8_t * q8 = y[i].qs;
|
9800
|
+
const uint8_t * qs = x[i].qs;
|
9801
|
+
const uint8_t * qh = x[i].qh;
|
9802
|
+
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
9803
|
+
|
9804
|
+
#if QK_K != 64
|
9805
|
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
9806
|
+
#endif
|
9807
|
+
|
9808
|
+
int32x4_t sumi1 = mzero;
|
9809
|
+
int32x4_t sumi2 = mzero;
|
9810
|
+
|
9811
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
9812
|
+
|
9813
|
+
q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
|
9814
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
|
9815
|
+
q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
|
9816
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
|
9817
|
+
q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
|
9818
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
|
9819
|
+
q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
|
9820
|
+
vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
|
9821
|
+
|
9822
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
9823
|
+
|
9824
|
+
const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
|
9825
|
+
const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
|
9826
|
+
const int32x4_t p12 = vpaddq_s32(p1, p2);
|
9827
|
+
|
9828
|
+
const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
|
9829
|
+
aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
|
9830
|
+
|
9831
|
+
const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
|
9832
|
+
const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
|
9833
|
+
const int32x4_t p34 = vpaddq_s32(p3, p4);
|
9834
|
+
|
9835
|
+
#if QK_K == 64
|
9836
|
+
int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12);
|
9837
|
+
#else
|
9838
|
+
int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
|
9839
|
+
#endif
|
9840
|
+
scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
|
9841
|
+
|
9842
|
+
sumi1 = vmlaq_s32(sumi1, scales_4, p12);
|
9843
|
+
sumi2 = vmlaq_s32(sumi2, scales_4, p34);
|
9844
|
+
|
9845
|
+
qs += 8; qh += 4;
|
9846
|
+
|
9847
|
+
}
|
9848
|
+
|
9849
|
+
#if QK_K == 64
|
9850
|
+
sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
|
9851
|
+
#else
|
9852
|
+
sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
|
9853
|
+
#endif
|
9854
|
+
}
|
9855
|
+
|
9856
|
+
*s = sumf;
|
9857
|
+
|
9858
|
+
#elif defined __AVX2__
|
9859
|
+
|
9860
|
+
#if QK_K == 64
|
9861
|
+
const __m256i mask = _mm256_set1_epi16(0xf);
|
9862
|
+
#else
|
9863
|
+
const __m256i mask = _mm256_set1_epi16(0x7);
|
9864
|
+
#endif
|
9865
|
+
const __m256i mone = _mm256_set1_epi16(1);
|
9866
|
+
|
9867
|
+
__m256 accum1 = _mm256_setzero_ps();
|
9868
|
+
__m256 accum2 = _mm256_setzero_ps();
|
9869
|
+
for (int i = 0; i < nb; ++i) {
|
9870
|
+
|
9871
|
+
const int8_t * q8 = y[i].qs;
|
9872
|
+
const uint8_t * qs = x[i].qs;
|
9873
|
+
const uint8_t * qh = x[i].qh;
|
9874
|
+
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
9875
|
+
|
9876
|
+
#if QK_K != 64
|
9877
|
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
9878
|
+
#endif
|
9879
|
+
|
9880
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
9881
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
9882
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
9883
|
+
const __m256i q1b_1 = _mm256_set_epi64x(
|
9884
|
+
iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
|
9885
|
+
iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
|
9886
|
+
);
|
9887
|
+
const __m256i q1b_2 = _mm256_set_epi64x(
|
9888
|
+
iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
|
9889
|
+
iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
|
9890
|
+
);
|
9891
|
+
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
9892
|
+
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
9893
|
+
|
9894
|
+
const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
|
9895
|
+
const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
|
9896
|
+
|
9897
|
+
const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
9898
|
+
qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
|
9899
|
+
qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
9900
|
+
qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
9901
|
+
const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
9902
|
+
qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
|
9903
|
+
qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
9904
|
+
qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
9905
|
+
|
9906
|
+
const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
|
9907
|
+
const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
|
9908
|
+
#if QK_K == 64
|
9909
|
+
__m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 4), _mm_set1_epi16(sc[0] >> 0));
|
9910
|
+
__m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8));
|
9911
|
+
#else
|
9912
|
+
__m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
|
9913
|
+
__m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
|
9914
|
+
#endif
|
9915
|
+
scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
|
9916
|
+
scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
|
9917
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
|
9918
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
|
9919
|
+
const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
|
9920
|
+
const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
|
9921
|
+
|
9922
|
+
sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
|
9923
|
+
sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
|
9924
|
+
|
9925
|
+
qs += 8; qh += 4;
|
9926
|
+
}
|
9927
|
+
|
9928
|
+
#if QK_K == 64
|
9929
|
+
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
|
9930
|
+
#else
|
9931
|
+
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
|
9932
|
+
#endif
|
9933
|
+
accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
|
9934
|
+
accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
|
9935
|
+
|
9936
|
+
}
|
9937
|
+
|
9938
|
+
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
9939
|
+
|
9940
|
+
#else
|
9941
|
+
|
9942
|
+
int sum1[2], sum2[2], delta[4];
|
9943
|
+
|
9944
|
+
float sumf = 0;
|
9945
|
+
for (int i = 0; i < nb; i++) {
|
9946
|
+
|
9947
|
+
const int8_t * q8 = y[i].qs;
|
9948
|
+
const uint8_t * qs = x[i].qs;
|
9949
|
+
const uint8_t * qh = x[i].qh;
|
9950
|
+
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
9951
|
+
|
9952
|
+
#if QK_K != 64
|
9953
|
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
9954
|
+
#endif
|
9955
|
+
|
9956
|
+
int sumi1 = 0, sumi2 = 0;
|
9957
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
9958
|
+
delta[0] = qh[0] & 0x08 ? -1 : 1;
|
9959
|
+
delta[1] = qh[0] & 0x80 ? -1 : 1;
|
9960
|
+
delta[2] = qh[1] & 0x08 ? -1 : 1;
|
9961
|
+
delta[3] = qh[1] & 0x80 ? -1 : 1;
|
9962
|
+
sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
|
9963
|
+
for (int l = 0; l < 4; ++l) {
|
9964
|
+
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
|
9965
|
+
int lsum1 = 0, lsum2 = 0;
|
9966
|
+
for (int j = 0; j < 8; ++j) {
|
9967
|
+
lsum1 += q8[j] * grid[j];
|
9968
|
+
lsum2 += q8[j];
|
9969
|
+
}
|
9970
|
+
q8 += 8;
|
9971
|
+
sum1[l/2] += lsum1;
|
9972
|
+
sum2[l/2] += lsum2*delta[l];
|
9973
|
+
}
|
9974
|
+
#if QK_K == 64
|
9975
|
+
const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1;
|
9976
|
+
const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1;
|
9977
|
+
#else
|
9978
|
+
const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
|
9979
|
+
const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
|
9980
|
+
#endif
|
9981
|
+
sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
|
9982
|
+
sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
|
9983
|
+
qs += 4;
|
9984
|
+
qh += 2;
|
9985
|
+
}
|
9986
|
+
|
9987
|
+
#if QK_K == 64
|
9988
|
+
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
9989
|
+
#else
|
9990
|
+
sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
9991
|
+
#endif
|
9992
|
+
}
|
9993
|
+
|
9994
|
+
*s = sumf;
|
9995
|
+
|
9996
|
+
#endif
|
9997
|
+
}
|
9998
|
+
|
9698
9999
|
void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
9699
10000
|
assert(nrc == 1);
|
9700
10001
|
UNUSED(nrc);
|
@@ -9938,17 +10239,17 @@ static iq2_entry_t iq2_data[4] = {
|
|
9938
10239
|
};
|
9939
10240
|
|
9940
10241
|
static inline int iq2_data_index(enum ggml_type type) {
|
9941
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
10242
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
|
9942
10243
|
return type == GGML_TYPE_IQ2_XXS ? 0 :
|
9943
10244
|
type == GGML_TYPE_IQ2_XS ? 1 :
|
9944
|
-
type == GGML_TYPE_IQ1_S ? 2 : 3;
|
10245
|
+
type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
|
9945
10246
|
}
|
9946
10247
|
|
9947
10248
|
static inline int iq2_grid_size(enum ggml_type type) {
|
9948
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
10249
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
|
9949
10250
|
return type == GGML_TYPE_IQ2_XXS ? 256 :
|
9950
10251
|
type == GGML_TYPE_IQ2_XS ? 512 :
|
9951
|
-
type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024;
|
10252
|
+
type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
|
9952
10253
|
}
|
9953
10254
|
|
9954
10255
|
static int iq2_compare_func(const void * left, const void * right) {
|
@@ -10214,10 +10515,10 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
10214
10515
|
|
10215
10516
|
const int kmap_size = 43692;
|
10216
10517
|
//const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
10217
|
-
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
|
10518
|
+
const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
|
10218
10519
|
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
|
10219
10520
|
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
10220
|
-
type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024;
|
10521
|
+
type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
|
10221
10522
|
uint64_t * kgrid_q2xs;
|
10222
10523
|
int * kmap_q2xs;
|
10223
10524
|
uint16_t * kneighbors_q2xs;
|
@@ -10314,7 +10615,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
10314
10615
|
}
|
10315
10616
|
|
10316
10617
|
void iq2xs_free_impl(enum ggml_type type) {
|
10317
|
-
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
10618
|
+
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
|
10318
10619
|
const int gindex = iq2_data_index(type);
|
10319
10620
|
if (iq2_data[gindex].grid) {
|
10320
10621
|
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
@@ -10347,7 +10648,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
10347
10648
|
return grid_index;
|
10348
10649
|
}
|
10349
10650
|
|
10350
|
-
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
10651
|
+
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
10351
10652
|
|
10352
10653
|
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
|
10353
10654
|
|
@@ -10363,7 +10664,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
10363
10664
|
|
10364
10665
|
const int kMaxQ = 3;
|
10365
10666
|
|
10366
|
-
const int nbl = n/QK_K;
|
10667
|
+
const int64_t nbl = n/QK_K;
|
10367
10668
|
|
10368
10669
|
block_iq2_xxs * y = vy;
|
10369
10670
|
|
@@ -10520,7 +10821,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
10520
10821
|
}
|
10521
10822
|
}
|
10522
10823
|
|
10523
|
-
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
10824
|
+
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
10524
10825
|
|
10525
10826
|
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
|
10526
10827
|
|
@@ -10536,7 +10837,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
10536
10837
|
|
10537
10838
|
const int kMaxQ = 3;
|
10538
10839
|
|
10539
|
-
const int nbl = n/QK_K;
|
10840
|
+
const int64_t nbl = n/QK_K;
|
10540
10841
|
|
10541
10842
|
block_iq2_xs * y = vy;
|
10542
10843
|
|
@@ -10700,11 +11001,11 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
10700
11001
|
}
|
10701
11002
|
}
|
10702
11003
|
|
10703
|
-
size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11004
|
+
size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
10704
11005
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
10705
|
-
|
11006
|
+
int64_t nblock = n_per_row/QK_K;
|
10706
11007
|
char * qrow = (char *)dst;
|
10707
|
-
for (int row = 0; row < nrow; ++row) {
|
11008
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
10708
11009
|
quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
|
10709
11010
|
src += n_per_row;
|
10710
11011
|
qrow += nblock*sizeof(block_iq2_xxs);
|
@@ -10712,11 +11013,11 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nro
|
|
10712
11013
|
return nrow * nblock * sizeof(block_iq2_xxs);
|
10713
11014
|
}
|
10714
11015
|
|
10715
|
-
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11016
|
+
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
10716
11017
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
10717
|
-
|
11018
|
+
int64_t nblock = n_per_row/QK_K;
|
10718
11019
|
char * qrow = (char *)dst;
|
10719
|
-
for (int row = 0; row < nrow; ++row) {
|
11020
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
10720
11021
|
quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
|
10721
11022
|
src += n_per_row;
|
10722
11023
|
qrow += nblock*sizeof(block_iq2_xs);
|
@@ -10941,7 +11242,7 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
10941
11242
|
return grid_index;
|
10942
11243
|
}
|
10943
11244
|
|
10944
|
-
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
|
11245
|
+
static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
|
10945
11246
|
const float * restrict quant_weights) {
|
10946
11247
|
|
10947
11248
|
const int gindex = iq3_data_index(grid_size);
|
@@ -10958,7 +11259,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
10958
11259
|
|
10959
11260
|
const int kMaxQ = 8;
|
10960
11261
|
|
10961
|
-
const int nbl = n/QK_K;
|
11262
|
+
const int64_t nbl = n/QK_K;
|
10962
11263
|
|
10963
11264
|
ggml_fp16_t * dh;
|
10964
11265
|
uint8_t * qs;
|
@@ -11154,11 +11455,11 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
11154
11455
|
}
|
11155
11456
|
}
|
11156
11457
|
|
11157
|
-
size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11458
|
+
size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
11158
11459
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11159
|
-
|
11460
|
+
int64_t nblock = n_per_row/QK_K;
|
11160
11461
|
char * qrow = (char *)dst;
|
11161
|
-
for (int row = 0; row < nrow; ++row) {
|
11462
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
11162
11463
|
quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
|
11163
11464
|
src += n_per_row;
|
11164
11465
|
qrow += nblock*sizeof(block_iq3_xxs);
|
@@ -11166,13 +11467,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nro
|
|
11166
11467
|
return nrow * nblock * sizeof(block_iq3_xxs);
|
11167
11468
|
}
|
11168
11469
|
|
11169
|
-
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
|
11470
|
+
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
|
11170
11471
|
assert(k % QK_K == 0);
|
11171
11472
|
block_iq3_xxs * restrict y = vy;
|
11172
11473
|
quantize_row_iq3_xxs_reference(x, y, k);
|
11173
11474
|
}
|
11174
11475
|
|
11175
|
-
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
|
11476
|
+
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
|
11176
11477
|
assert(k % QK_K == 0);
|
11177
11478
|
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
11178
11479
|
}
|
@@ -11203,7 +11504,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
11203
11504
|
|
11204
11505
|
const int kMaxQ = 8;
|
11205
11506
|
|
11206
|
-
const int nbl = n/QK_K;
|
11507
|
+
const int64_t nbl = n/QK_K;
|
11207
11508
|
|
11208
11509
|
block_iq3_s * y = vy;
|
11209
11510
|
|
@@ -11360,9 +11661,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
11360
11661
|
}
|
11361
11662
|
|
11362
11663
|
#define IQ3S_BLOCK_SIZE 32
|
11363
|
-
size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
11664
|
+
size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
11364
11665
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11365
|
-
|
11666
|
+
int64_t nblock = n_per_row/QK_K;
|
11366
11667
|
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
11367
11668
|
float weight[IQ3S_BLOCK_SIZE];
|
11368
11669
|
float xval[IQ3S_BLOCK_SIZE];
|
@@ -11373,7 +11674,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
|
|
11373
11674
|
bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
|
11374
11675
|
uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
|
11375
11676
|
char * qrow = (char *)dst;
|
11376
|
-
for (int row = 0; row < nrow; ++row) {
|
11677
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
11377
11678
|
quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
|
11378
11679
|
scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
|
11379
11680
|
src += n_per_row;
|
@@ -11382,13 +11683,13 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
|
|
11382
11683
|
return nrow * nblock * sizeof(block_iq3_s);
|
11383
11684
|
}
|
11384
11685
|
|
11385
|
-
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
11686
|
+
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
|
11386
11687
|
assert(k % QK_K == 0);
|
11387
11688
|
block_iq3_s * restrict y = vy;
|
11388
11689
|
quantize_row_iq3_s_reference(x, y, k);
|
11389
11690
|
}
|
11390
11691
|
|
11391
|
-
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
11692
|
+
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
|
11392
11693
|
assert(k % QK_K == 0);
|
11393
11694
|
quantize_iq3_s(x, y, 1, k, NULL);
|
11394
11695
|
}
|
@@ -11520,7 +11821,16 @@ static int iq1_sort_helper(const void * left, const void * right) {
|
|
11520
11821
|
}
|
11521
11822
|
|
11522
11823
|
#define IQ1S_BLOCK_SIZE 32
|
11523
|
-
|
11824
|
+
#define IQ1M_BLOCK_SIZE 16
|
11825
|
+
static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
|
11826
|
+
float * scales,
|
11827
|
+
float * weight,
|
11828
|
+
float * sumx,
|
11829
|
+
float * sumw,
|
11830
|
+
float * pairs,
|
11831
|
+
int8_t * L,
|
11832
|
+
uint16_t * index,
|
11833
|
+
int8_t * shifts) {
|
11524
11834
|
|
11525
11835
|
const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
|
11526
11836
|
|
@@ -11534,22 +11844,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11534
11844
|
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
11535
11845
|
GGML_ASSERT(n%QK_K == 0);
|
11536
11846
|
|
11537
|
-
const int nbl = n/QK_K;
|
11538
|
-
|
11539
11847
|
block_iq1_s * y = vy;
|
11540
11848
|
|
11849
|
+
const int64_t nbl = n/QK_K;
|
11850
|
+
|
11851
|
+
const int block_size = IQ1S_BLOCK_SIZE;
|
11852
|
+
|
11541
11853
|
const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
|
11542
11854
|
const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
|
11543
11855
|
|
11544
|
-
|
11545
|
-
float weight[IQ1S_BLOCK_SIZE];
|
11546
|
-
int8_t L[IQ1S_BLOCK_SIZE];
|
11547
|
-
float sumx[IQ1S_BLOCK_SIZE+1];
|
11548
|
-
float sumw[IQ1S_BLOCK_SIZE+1];
|
11549
|
-
float pairs[2*IQ1S_BLOCK_SIZE];
|
11856
|
+
|
11550
11857
|
int * idx = (int *)(pairs + 1);
|
11551
|
-
uint16_t index[IQ1S_BLOCK_SIZE/8];
|
11552
|
-
int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
|
11553
11858
|
|
11554
11859
|
for (int ibl = 0; ibl < nbl; ++ibl) {
|
11555
11860
|
|
@@ -11564,15 +11869,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11564
11869
|
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
11565
11870
|
float sigma2 = 2*sumx2/QK_K;
|
11566
11871
|
|
11567
|
-
for (int ib = 0; ib < QK_K/
|
11568
|
-
const float * xb = xbl +
|
11569
|
-
const float * qw = quant_weights + QK_K*ibl +
|
11570
|
-
for (int i = 0; i <
|
11872
|
+
for (int ib = 0; ib < QK_K/block_size; ++ib) {
|
11873
|
+
const float * xb = xbl + block_size*ib;
|
11874
|
+
const float * qw = quant_weights + QK_K*ibl + block_size*ib;
|
11875
|
+
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
11571
11876
|
float max = fabsf(xb[0]);
|
11572
|
-
for (int i = 1; i <
|
11877
|
+
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
11573
11878
|
if (!max) {
|
11574
11879
|
scales[ib] = 0;
|
11575
|
-
memset(L, 1,
|
11880
|
+
memset(L, 1, block_size);
|
11576
11881
|
continue;
|
11577
11882
|
}
|
11578
11883
|
// Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
|
@@ -11581,14 +11886,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11581
11886
|
// in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
|
11582
11887
|
// Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
|
11583
11888
|
// for each possible and score for each split.
|
11584
|
-
for (int j = 0; j <
|
11889
|
+
for (int j = 0; j < block_size; ++j) {
|
11585
11890
|
pairs[2*j] = xb[j];
|
11586
11891
|
idx[2*j] = j;
|
11587
11892
|
}
|
11588
|
-
qsort(pairs,
|
11893
|
+
qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
|
11589
11894
|
{
|
11590
11895
|
sumx[0] = sumw[0] = 0;
|
11591
|
-
for (int j = 0; j <
|
11896
|
+
for (int j = 0; j < block_size; ++j) {
|
11592
11897
|
int i = idx[2*j];
|
11593
11898
|
sumx[j+1] = sumx[j] + weight[i]*xb[i];
|
11594
11899
|
sumw[j+1] = sumw[j] + weight[i];
|
@@ -11596,16 +11901,16 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11596
11901
|
}
|
11597
11902
|
float best_score = 0, scale = max;
|
11598
11903
|
int besti1 = -1, besti2 = -1, best_shift = 0;
|
11599
|
-
for (int i1 = 0; i1 <=
|
11600
|
-
for (int i2 = i1; i2 <=
|
11601
|
-
float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[
|
11602
|
-
float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[
|
11904
|
+
for (int i1 = 0; i1 <= block_size; ++i1) {
|
11905
|
+
for (int i2 = i1; i2 <= block_size; ++i2) {
|
11906
|
+
float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
|
11907
|
+
float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
|
11603
11908
|
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
11604
11909
|
scale = sumqx/sumq2; best_score = scale*sumqx;
|
11605
11910
|
besti1 = i1; besti2 = i2; best_shift = 1;
|
11606
11911
|
}
|
11607
|
-
sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[
|
11608
|
-
sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[
|
11912
|
+
sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
|
11913
|
+
sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
|
11609
11914
|
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
11610
11915
|
scale = sumqx/sumq2; best_score = scale*sumqx;
|
11611
11916
|
besti1 = i1; besti2 = i2; best_shift = -1;
|
@@ -11615,14 +11920,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11615
11920
|
GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
|
11616
11921
|
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
|
11617
11922
|
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
|
11618
|
-
for (int j = besti2; j <
|
11923
|
+
for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
|
11619
11924
|
if (scale < 0) {
|
11620
|
-
for (int j = 0; j <
|
11925
|
+
for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
|
11621
11926
|
scale = -scale; best_shift = -best_shift;
|
11622
11927
|
}
|
11623
11928
|
bool all_on_grid = true;
|
11624
11929
|
const float * xx = best_shift == 1 ? x_p : x_m;
|
11625
|
-
for (int k = 0; k <
|
11930
|
+
for (int k = 0; k < block_size/8; ++k) {
|
11626
11931
|
uint16_t u = 0;
|
11627
11932
|
for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
|
11628
11933
|
int grid_index = kmap_q2xs[u];
|
@@ -11636,7 +11941,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11636
11941
|
}
|
11637
11942
|
if (!all_on_grid) {
|
11638
11943
|
float sumqx = 0, sumq2 = 0;
|
11639
|
-
for (int k = 0; k <
|
11944
|
+
for (int k = 0; k < block_size/8; ++k) {
|
11640
11945
|
const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
|
11641
11946
|
for (int j = 0; j < 8; ++j) {
|
11642
11947
|
float w = weight[8*k + j];
|
@@ -11648,8 +11953,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11648
11953
|
if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
|
11649
11954
|
}
|
11650
11955
|
uint16_t h = 0;
|
11651
|
-
for (int k = 0; k <
|
11652
|
-
y[ibl].qs[(
|
11956
|
+
for (int k = 0; k < block_size/8; ++k) {
|
11957
|
+
y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
|
11653
11958
|
h |= (index[k] >> 8) << 3*k;
|
11654
11959
|
}
|
11655
11960
|
y[ibl].qh[ib] = h;
|
@@ -11660,14 +11965,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11660
11965
|
}
|
11661
11966
|
|
11662
11967
|
if (!max_scale) {
|
11663
|
-
memset(y[ibl].qs, 0, QK_K/8);
|
11664
11968
|
continue;
|
11665
11969
|
}
|
11666
11970
|
|
11667
11971
|
float d = max_scale/15;
|
11668
|
-
y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.
|
11972
|
+
y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
|
11669
11973
|
float id = 1/d;
|
11670
|
-
for (int ib = 0; ib < QK_K/
|
11974
|
+
for (int ib = 0; ib < QK_K/block_size; ++ib) {
|
11671
11975
|
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
11672
11976
|
l = MAX(0, MIN(7, l));
|
11673
11977
|
if (shifts[ib] == -1) l |= 8;
|
@@ -11676,18 +11980,309 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11676
11980
|
}
|
11677
11981
|
}
|
11678
11982
|
|
11679
|
-
size_t quantize_iq1_s(const float * restrict src, void * restrict dst,
|
11983
|
+
size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
11680
11984
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11681
|
-
|
11985
|
+
float scales[QK_K/IQ1S_BLOCK_SIZE];
|
11986
|
+
float weight[IQ1S_BLOCK_SIZE];
|
11987
|
+
int8_t L[IQ1S_BLOCK_SIZE];
|
11988
|
+
float sumx[IQ1S_BLOCK_SIZE+1];
|
11989
|
+
float sumw[IQ1S_BLOCK_SIZE+1];
|
11990
|
+
float pairs[2*IQ1S_BLOCK_SIZE];
|
11991
|
+
uint16_t index[IQ1S_BLOCK_SIZE/8];
|
11992
|
+
int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
|
11993
|
+
int64_t nblock = n_per_row/QK_K;
|
11682
11994
|
char * qrow = (char *)dst;
|
11683
|
-
for (
|
11684
|
-
quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
|
11995
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
11996
|
+
quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
|
11685
11997
|
src += n_per_row;
|
11686
11998
|
qrow += nblock*sizeof(block_iq1_s);
|
11687
11999
|
}
|
11688
12000
|
return nrow * nblock * sizeof(block_iq1_s);
|
11689
12001
|
}
|
11690
12002
|
|
12003
|
+
static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
|
12004
|
+
float * scales,
|
12005
|
+
float * weight,
|
12006
|
+
float * pairs,
|
12007
|
+
int8_t * L,
|
12008
|
+
uint16_t * index,
|
12009
|
+
int8_t * shifts) {
|
12010
|
+
|
12011
|
+
const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
|
12012
|
+
|
12013
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
12014
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
12015
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
12016
|
+
|
12017
|
+
//GGML_ASSERT(quant_weights && "missing quantization weights");
|
12018
|
+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
12019
|
+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
12020
|
+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
12021
|
+
GGML_ASSERT(n%QK_K == 0);
|
12022
|
+
|
12023
|
+
block_iq1_m * y = vy;
|
12024
|
+
|
12025
|
+
const int64_t nbl = n/QK_K;
|
12026
|
+
|
12027
|
+
const int block_size = IQ1M_BLOCK_SIZE;
|
12028
|
+
|
12029
|
+
const float x_p[3] = {-1 + IQ1M_DELTA, IQ1M_DELTA, 1 + IQ1M_DELTA};
|
12030
|
+
const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
|
12031
|
+
const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
|
12032
|
+
|
12033
|
+
int * idx = (int *)(pairs + 1);
|
12034
|
+
|
12035
|
+
float sumqx[4], sumq2[4];
|
12036
|
+
|
12037
|
+
iq1m_scale_t s;
|
12038
|
+
const float * xx;
|
12039
|
+
|
12040
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
12041
|
+
|
12042
|
+
#if QK_K == 64
|
12043
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
12044
|
+
#endif
|
12045
|
+
memset(y[ibl].qs, 0, QK_K/8);
|
12046
|
+
memset(y[ibl].qh, 0, QK_K/16);
|
12047
|
+
memset(y[ibl].scales, 0, QK_K/32);
|
12048
|
+
|
12049
|
+
float max_scale = 0;
|
12050
|
+
|
12051
|
+
const float * xbl = x + QK_K*ibl;
|
12052
|
+
float sumx2 = 0;
|
12053
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
12054
|
+
float sigma2 = 2*sumx2/QK_K;
|
12055
|
+
|
12056
|
+
for (int ib = 0; ib < QK_K/block_size; ++ib) {
|
12057
|
+
const float * xb = xbl + block_size*ib;
|
12058
|
+
if (quant_weights) {
|
12059
|
+
const float * qw = quant_weights + QK_K*ibl + block_size*ib;
|
12060
|
+
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
12061
|
+
} else {
|
12062
|
+
for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
|
12063
|
+
}
|
12064
|
+
float max = fabsf(xb[0]);
|
12065
|
+
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
12066
|
+
if (!max) {
|
12067
|
+
scales[ib] = 0;
|
12068
|
+
memset(L, 1, block_size);
|
12069
|
+
continue;
|
12070
|
+
}
|
12071
|
+
// Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
|
12072
|
+
// With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
|
12073
|
+
// boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
|
12074
|
+
// in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
|
12075
|
+
// Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
|
12076
|
+
// for each possible and score for each split.
|
12077
|
+
for (int j = 0; j < block_size; ++j) {
|
12078
|
+
pairs[2*j] = xb[j];
|
12079
|
+
idx[2*j] = j;
|
12080
|
+
}
|
12081
|
+
qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
|
12082
|
+
float best_score = 0, scale = max;
|
12083
|
+
int besti1 = -1, besti2 = -1, best_k = -1;
|
12084
|
+
// 0: +, +
|
12085
|
+
// 1: +, -
|
12086
|
+
// 2: -, +
|
12087
|
+
// 3: -, -
|
12088
|
+
for (int i1 = 0; i1 <= block_size; ++i1) {
|
12089
|
+
for (int i2 = i1; i2 <= block_size; ++i2) {
|
12090
|
+
memset(sumqx, 0, 4*sizeof(float));
|
12091
|
+
memset(sumq2, 0, 4*sizeof(float));
|
12092
|
+
for (int j = 0; j < i1; ++j) {
|
12093
|
+
int i = idx[2*j];
|
12094
|
+
if (i < block_size/2) {
|
12095
|
+
sumqx[0] += weight[i]*x_p[0]*xb[i];
|
12096
|
+
sumqx[1] += weight[i]*x_p[0]*xb[i];
|
12097
|
+
sumqx[2] += weight[i]*x_m[0]*xb[i];
|
12098
|
+
sumqx[3] += weight[i]*x_m[0]*xb[i];
|
12099
|
+
sumq2[0] += weight[i]*x_p[0]*x_p[0];
|
12100
|
+
sumq2[1] += weight[i]*x_p[0]*x_p[0];
|
12101
|
+
sumq2[2] += weight[i]*x_m[0]*x_m[0];
|
12102
|
+
sumq2[3] += weight[i]*x_m[0]*x_m[0];
|
12103
|
+
} else {
|
12104
|
+
sumqx[0] += weight[i]*x_p[0]*xb[i];
|
12105
|
+
sumqx[2] += weight[i]*x_p[0]*xb[i];
|
12106
|
+
sumqx[1] += weight[i]*x_m[0]*xb[i];
|
12107
|
+
sumqx[3] += weight[i]*x_m[0]*xb[i];
|
12108
|
+
sumq2[0] += weight[i]*x_p[0]*x_p[0];
|
12109
|
+
sumq2[2] += weight[i]*x_p[0]*x_p[0];
|
12110
|
+
sumq2[1] += weight[i]*x_m[0]*x_m[0];
|
12111
|
+
sumq2[3] += weight[i]*x_m[0]*x_m[0];
|
12112
|
+
}
|
12113
|
+
}
|
12114
|
+
for (int j = i1; j < i2; ++j) {
|
12115
|
+
int i = idx[2*j];
|
12116
|
+
if (i < block_size/2) {
|
12117
|
+
sumqx[0] += weight[i]*x_p[1]*xb[i];
|
12118
|
+
sumqx[1] += weight[i]*x_p[1]*xb[i];
|
12119
|
+
sumqx[2] += weight[i]*x_m[1]*xb[i];
|
12120
|
+
sumqx[3] += weight[i]*x_m[1]*xb[i];
|
12121
|
+
sumq2[0] += weight[i]*x_p[1]*x_p[1];
|
12122
|
+
sumq2[1] += weight[i]*x_p[1]*x_p[1];
|
12123
|
+
sumq2[2] += weight[i]*x_m[1]*x_m[1];
|
12124
|
+
sumq2[3] += weight[i]*x_m[1]*x_m[1];
|
12125
|
+
} else {
|
12126
|
+
sumqx[0] += weight[i]*x_p[1]*xb[i];
|
12127
|
+
sumqx[2] += weight[i]*x_p[1]*xb[i];
|
12128
|
+
sumqx[1] += weight[i]*x_m[1]*xb[i];
|
12129
|
+
sumqx[3] += weight[i]*x_m[1]*xb[i];
|
12130
|
+
sumq2[0] += weight[i]*x_p[1]*x_p[1];
|
12131
|
+
sumq2[2] += weight[i]*x_p[1]*x_p[1];
|
12132
|
+
sumq2[1] += weight[i]*x_m[1]*x_m[1];
|
12133
|
+
sumq2[3] += weight[i]*x_m[1]*x_m[1];
|
12134
|
+
}
|
12135
|
+
}
|
12136
|
+
for (int j = i2; j < block_size; ++j) {
|
12137
|
+
int i = idx[2*j];
|
12138
|
+
if (i < block_size/2) {
|
12139
|
+
sumqx[0] += weight[i]*x_p[2]*xb[i];
|
12140
|
+
sumqx[1] += weight[i]*x_p[2]*xb[i];
|
12141
|
+
sumqx[2] += weight[i]*x_m[2]*xb[i];
|
12142
|
+
sumqx[3] += weight[i]*x_m[2]*xb[i];
|
12143
|
+
sumq2[0] += weight[i]*x_p[2]*x_p[2];
|
12144
|
+
sumq2[1] += weight[i]*x_p[2]*x_p[2];
|
12145
|
+
sumq2[2] += weight[i]*x_m[2]*x_m[2];
|
12146
|
+
sumq2[3] += weight[i]*x_m[2]*x_m[2];
|
12147
|
+
} else {
|
12148
|
+
sumqx[0] += weight[i]*x_p[2]*xb[i];
|
12149
|
+
sumqx[2] += weight[i]*x_p[2]*xb[i];
|
12150
|
+
sumqx[1] += weight[i]*x_m[2]*xb[i];
|
12151
|
+
sumqx[3] += weight[i]*x_m[2]*xb[i];
|
12152
|
+
sumq2[0] += weight[i]*x_p[2]*x_p[2];
|
12153
|
+
sumq2[2] += weight[i]*x_p[2]*x_p[2];
|
12154
|
+
sumq2[1] += weight[i]*x_m[2]*x_m[2];
|
12155
|
+
sumq2[3] += weight[i]*x_m[2]*x_m[2];
|
12156
|
+
}
|
12157
|
+
}
|
12158
|
+
for (int k = 0; k < 4; ++k) {
|
12159
|
+
if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
|
12160
|
+
scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
|
12161
|
+
besti1 = i1; besti2 = i2; best_k = k;
|
12162
|
+
}
|
12163
|
+
}
|
12164
|
+
}
|
12165
|
+
}
|
12166
|
+
GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
|
12167
|
+
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
|
12168
|
+
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
|
12169
|
+
for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
|
12170
|
+
if (scale < 0) {
|
12171
|
+
for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
|
12172
|
+
scale = -scale;
|
12173
|
+
best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
|
12174
|
+
}
|
12175
|
+
bool all_on_grid = true;
|
12176
|
+
for (int k = 0; k < block_size/8; ++k) {
|
12177
|
+
if (k == 0) xx = best_k < 2 ? x_p : x_m;
|
12178
|
+
else xx = best_k%2 == 0 ? x_p : x_m;
|
12179
|
+
uint16_t u = 0;
|
12180
|
+
for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
|
12181
|
+
int grid_index = kmap_q2xs[u];
|
12182
|
+
if (grid_index < 0) {
|
12183
|
+
all_on_grid = false;
|
12184
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
12185
|
+
grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
|
12186
|
+
GGML_ASSERT(grid_index >= 0);
|
12187
|
+
}
|
12188
|
+
index[k] = grid_index;
|
12189
|
+
}
|
12190
|
+
if (!all_on_grid) {
|
12191
|
+
float sumqx_f = 0, sumq2_f = 0;
|
12192
|
+
for (int k = 0; k < block_size/8; ++k) {
|
12193
|
+
if (k == 0) xx = best_k < 2 ? x_p : x_m;
|
12194
|
+
else xx = best_k%2 == 0 ? x_p : x_m;
|
12195
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
|
12196
|
+
for (int j = 0; j < 8; ++j) {
|
12197
|
+
float w = weight[8*k + j];
|
12198
|
+
float q = xx[(pg[j] - 1)/2];
|
12199
|
+
sumqx_f += w*q*xb[8*k+j];
|
12200
|
+
sumq2_f += w*q*q;
|
12201
|
+
}
|
12202
|
+
}
|
12203
|
+
if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
|
12204
|
+
}
|
12205
|
+
y[ibl].qs[2*ib + 0] = index[0] & 255;
|
12206
|
+
y[ibl].qs[2*ib + 1] = index[1] & 255;
|
12207
|
+
y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
|
12208
|
+
GGML_ASSERT(scale >= 0);
|
12209
|
+
scales[ib] = scale;
|
12210
|
+
shifts[ib] = best_k;
|
12211
|
+
max_scale = MAX(max_scale, scale);
|
12212
|
+
}
|
12213
|
+
|
12214
|
+
if (!max_scale) {
|
12215
|
+
continue;
|
12216
|
+
}
|
12217
|
+
|
12218
|
+
uint16_t * sc = (uint16_t *)y[ibl].scales;
|
12219
|
+
#if QK_K == 64
|
12220
|
+
float d = max_scale/31;
|
12221
|
+
#else
|
12222
|
+
float d = max_scale/15;
|
12223
|
+
#endif
|
12224
|
+
float id = 1/d;
|
12225
|
+
float sumqx_f = 0, sumq2_f = 0;
|
12226
|
+
for (int ib = 0; ib < QK_K/block_size; ++ib) {
|
12227
|
+
int l = nearest_int(0.5f*(id*scales[ib+0]-1));
|
12228
|
+
#if QK_K == 64
|
12229
|
+
l = MAX(0, MIN(15, l));
|
12230
|
+
sc[ib/4] |= (l << 4*(ib%4));
|
12231
|
+
#else
|
12232
|
+
l = MAX(0, MIN(7, l));
|
12233
|
+
sc[ib/4] |= (l << 3*(ib%4));
|
12234
|
+
#endif
|
12235
|
+
y[ibl].qh[ib] |= masks[shifts[ib]];
|
12236
|
+
const float * xb = xbl + block_size*ib;
|
12237
|
+
if (quant_weights) {
|
12238
|
+
const float * qw = quant_weights + QK_K*ibl + block_size*ib;
|
12239
|
+
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
12240
|
+
} else {
|
12241
|
+
for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
|
12242
|
+
}
|
12243
|
+
for (int k = 0; k < block_size/8; ++k) {
|
12244
|
+
if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
|
12245
|
+
else xx = shifts[ib]%2 == 0 ? x_p : x_m;
|
12246
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
|
12247
|
+
for (int j = 0; j < 8; ++j) {
|
12248
|
+
float w = weight[8*k + j];
|
12249
|
+
float q = xx[(pg[j] - 1)/2]*(2*l+1);
|
12250
|
+
sumqx_f += w*q*xb[8*k+j];
|
12251
|
+
sumq2_f += w*q*q;
|
12252
|
+
}
|
12253
|
+
}
|
12254
|
+
}
|
12255
|
+
if (sumq2_f > 0) d = sumqx_f/sumq2_f;
|
12256
|
+
s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
|
12257
|
+
#if QK_K == 64
|
12258
|
+
y[ibl].d = s.f16;
|
12259
|
+
#else
|
12260
|
+
sc[0] |= ((s.u16 & 0x000f) << 12);
|
12261
|
+
sc[1] |= ((s.u16 & 0x00f0) << 8);
|
12262
|
+
sc[2] |= ((s.u16 & 0x0f00) << 4);
|
12263
|
+
sc[3] |= ((s.u16 & 0xf000) << 0);
|
12264
|
+
#endif
|
12265
|
+
}
|
12266
|
+
}
|
12267
|
+
|
12268
|
+
size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
12269
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
12270
|
+
float scales[QK_K/IQ1M_BLOCK_SIZE];
|
12271
|
+
float weight[IQ1M_BLOCK_SIZE];
|
12272
|
+
int8_t L[IQ1M_BLOCK_SIZE];
|
12273
|
+
float pairs[2*IQ1M_BLOCK_SIZE];
|
12274
|
+
uint16_t index[IQ1M_BLOCK_SIZE/8];
|
12275
|
+
int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
|
12276
|
+
int64_t nblock = n_per_row/QK_K;
|
12277
|
+
char * qrow = (char *)dst;
|
12278
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
12279
|
+
quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
|
12280
|
+
src += n_per_row;
|
12281
|
+
qrow += nblock*sizeof(block_iq1_m);
|
12282
|
+
}
|
12283
|
+
return nrow * nblock * sizeof(block_iq1_m);
|
12284
|
+
}
|
12285
|
+
|
11691
12286
|
// ============================ 4-bit non-linear quants
|
11692
12287
|
|
11693
12288
|
static inline int best_index_int8(int n, const int8_t * val, float x) {
|
@@ -11812,16 +12407,16 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|
11812
12407
|
}
|
11813
12408
|
}
|
11814
12409
|
|
11815
|
-
size_t quantize_iq4_nl(const float * restrict src, void * restrict dst,
|
12410
|
+
size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
11816
12411
|
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
11817
|
-
|
12412
|
+
int64_t nblock = n_per_row/QK4_NL;
|
11818
12413
|
char * qrow = (char *)dst;
|
11819
12414
|
uint8_t L[QK4_NL];
|
11820
12415
|
float weight[QK4_NL];
|
11821
12416
|
uint16_t unused_h;
|
11822
12417
|
uint8_t * unused_l = NULL;
|
11823
12418
|
float scale;
|
11824
|
-
for (
|
12419
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
11825
12420
|
block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
|
11826
12421
|
for (int ibl = 0; ibl < nblock; ++ibl) {
|
11827
12422
|
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
|
@@ -11834,9 +12429,9 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
|
|
11834
12429
|
return nrow * nblock * sizeof(block_iq4_nl);
|
11835
12430
|
}
|
11836
12431
|
|
11837
|
-
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy,
|
12432
|
+
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) {
|
11838
12433
|
GGML_ASSERT(k%QK4_NL == 0);
|
11839
|
-
|
12434
|
+
int64_t nblock = k/QK4_NL;
|
11840
12435
|
uint8_t L[QK4_NL];
|
11841
12436
|
float weight[QK4_NL];
|
11842
12437
|
uint16_t unused_h;
|
@@ -11849,22 +12444,22 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
|
|
11849
12444
|
}
|
11850
12445
|
}
|
11851
12446
|
|
11852
|
-
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y,
|
12447
|
+
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
|
11853
12448
|
assert(k % QK4_NL == 0);
|
11854
12449
|
quantize_row_iq4_nl(x, y, k);
|
11855
12450
|
}
|
11856
12451
|
|
11857
|
-
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst,
|
12452
|
+
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
11858
12453
|
#if QK_K == 64
|
11859
12454
|
return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
|
11860
12455
|
#else
|
11861
12456
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
11862
|
-
|
12457
|
+
int64_t nblock = n_per_row/QK_K;
|
11863
12458
|
char * qrow = (char *)dst;
|
11864
12459
|
uint8_t L[QK_K];
|
11865
12460
|
float weight[32];
|
11866
12461
|
float scales[QK_K/32];
|
11867
|
-
for (
|
12462
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
11868
12463
|
block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
|
11869
12464
|
for (int ibl = 0; ibl < nblock; ++ibl) {
|
11870
12465
|
const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
|
@@ -11878,20 +12473,20 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
|
|
11878
12473
|
#endif
|
11879
12474
|
}
|
11880
12475
|
|
11881
|
-
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy,
|
12476
|
+
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
|
11882
12477
|
assert(k % QK_K == 0);
|
11883
12478
|
block_iq4_xs * restrict y = vy;
|
11884
12479
|
quantize_row_iq4_xs_reference(x, y, k);
|
11885
12480
|
}
|
11886
12481
|
|
11887
|
-
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y,
|
12482
|
+
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
|
11888
12483
|
assert(k % QK_K == 0);
|
11889
12484
|
quantize_iq4_xs(x, y, 1, k, NULL);
|
11890
12485
|
}
|
11891
12486
|
|
11892
12487
|
// =============================== 2.5625 bpw
|
11893
12488
|
|
11894
|
-
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy,
|
12489
|
+
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
|
11895
12490
|
|
11896
12491
|
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
11897
12492
|
|
@@ -11906,7 +12501,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
11906
12501
|
|
11907
12502
|
const int kMaxQ = 3;
|
11908
12503
|
|
11909
|
-
const
|
12504
|
+
const int64_t nbl = n/QK_K;
|
11910
12505
|
|
11911
12506
|
block_iq2_s * y = vy;
|
11912
12507
|
|
@@ -12059,11 +12654,11 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
12059
12654
|
}
|
12060
12655
|
}
|
12061
12656
|
|
12062
|
-
size_t quantize_iq2_s(const float * restrict src, void * restrict dst,
|
12657
|
+
size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
12063
12658
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
12064
|
-
|
12659
|
+
int64_t nblock = n_per_row/QK_K;
|
12065
12660
|
char * qrow = (char *)dst;
|
12066
|
-
for (
|
12661
|
+
for (int64_t row = 0; row < nrow; ++row) {
|
12067
12662
|
quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
|
12068
12663
|
src += n_per_row;
|
12069
12664
|
qrow += nblock*sizeof(block_iq2_s);
|
@@ -12071,12 +12666,12 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow,
|
|
12071
12666
|
return nrow * nblock * sizeof(block_iq2_s);
|
12072
12667
|
}
|
12073
12668
|
|
12074
|
-
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y,
|
12669
|
+
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
|
12075
12670
|
assert(k % QK_K == 0);
|
12076
12671
|
quantize_iq2_s(x, y, 1, k, NULL);
|
12077
12672
|
}
|
12078
12673
|
|
12079
|
-
void quantize_row_iq2_s(const float * restrict x, void * restrict vy,
|
12674
|
+
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
|
12080
12675
|
assert(k % QK_K == 0);
|
12081
12676
|
block_iq2_s * restrict y = vy;
|
12082
12677
|
quantize_row_iq2_s_reference(x, y, k);
|