llama_cpp 0.12.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
@@ -5,6 +5,8 @@
|
|
5
5
|
#include <string.h>
|
6
6
|
#include <assert.h>
|
7
7
|
#include <float.h>
|
8
|
+
#include <stdlib.h> // for qsort
|
9
|
+
#include <stdio.h> // for GGML_ASSERT
|
8
10
|
|
9
11
|
#ifdef __ARM_NEON
|
10
12
|
|
@@ -272,10 +274,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
|
272
274
|
|
273
275
|
// vaddvq_s16
|
274
276
|
// vpaddq_s16
|
277
|
+
// vpaddq_s32
|
275
278
|
// vaddvq_s32
|
276
279
|
// vaddvq_f32
|
277
280
|
// vmaxvq_f32
|
278
281
|
// vcvtnq_s32_f32
|
282
|
+
// vzip1_u8
|
283
|
+
// vzip2_u8
|
279
284
|
|
280
285
|
inline static int32_t vaddvq_s16(int16x8_t v) {
|
281
286
|
return
|
@@ -291,6 +296,12 @@ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
|
291
296
|
return vcombine_s16(a0, b0);
|
292
297
|
}
|
293
298
|
|
299
|
+
// Fallback for the vpaddq_s32 intrinsic: pairwise-adds the two halves of each
// input with vpadd_s32 and joins the results (sums from `a` in the low half,
// sums from `b` in the high half).
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
    return vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
                        vpadd_s32(vget_low_s32(b), vget_high_s32(b)));
}
|
304
|
+
|
294
305
|
// Fallback for the vaddvq_s32 intrinsic: horizontal sum of the four 32-bit lanes.
inline static int32_t vaddvq_s32(int32x4_t v) {
    const int32_t lane0 = vgetq_lane_s32(v, 0);
    const int32_t lane1 = vgetq_lane_s32(v, 1);
    const int32_t lane2 = vgetq_lane_s32(v, 2);
    const int32_t lane3 = vgetq_lane_s32(v, 3);
    return lane0 + lane1 + lane2 + lane3;
}
|
@@ -316,6 +327,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
|
316
327
|
return res;
|
317
328
|
}
|
318
329
|
|
330
|
+
// Fallback for the vzip1_u8 intrinsic: interleaves the low four lanes of `a`
// and `b`, giving {a0, b0, a1, b1, a2, b2, a3, b3}.
//
// Uses the vzip_u8 intrinsic instead of subscripting the vector types: lane
// subscripting is a compiler extension and started from an uninitialized
// vector. val[0] of the vzip_u8 result is the low-half interleave.
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    const uint8x8x2_t zipped = vzip_u8(a, b);
    return zipped.val[0];
}
|
340
|
+
|
341
|
+
// Fallback for the vzip2_u8 intrinsic: interleaves the high four lanes of `a`
// and `b`, giving {a4, b4, a5, b5, a6, b6, a7, b7}.
//
// Uses the vzip_u8 intrinsic instead of subscripting the vector types: lane
// subscripting is a compiler extension and started from an uninitialized
// vector. val[1] of the vzip_u8 result is the high-half interleave.
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    const uint8x8x2_t zipped = vzip_u8(a, b);
    return zipped.val[1];
}
|
351
|
+
|
319
352
|
// vld1q_s16_x2
|
320
353
|
// vld1q_u8_x2
|
321
354
|
// vld1q_u8_x4
|
@@ -482,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|
482
515
|
quantize_row_q4_0_reference(x, y, k);
|
483
516
|
}
|
484
517
|
|
518
|
+
|
485
519
|
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
|
486
520
|
const int qk = QK4_1;
|
487
521
|
|
@@ -1211,7 +1245,8 @@ static inline int nearest_int(float fval) {
|
|
1211
1245
|
return (i & 0x007fffff) - 0x00400000;
|
1212
1246
|
}
|
1213
1247
|
|
1214
|
-
static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type
|
1248
|
+
static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
|
1249
|
+
const float * restrict qw) {
|
1215
1250
|
float max = 0;
|
1216
1251
|
float amax = 0;
|
1217
1252
|
for (int i = 0; i < n; ++i) {
|
@@ -1237,14 +1272,13 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
1237
1272
|
rmse_type = -rmse_type;
|
1238
1273
|
return_early = true;
|
1239
1274
|
}
|
1240
|
-
int weight_type = rmse_type%2;
|
1241
1275
|
float sumlx = 0;
|
1242
1276
|
float suml2 = 0;
|
1243
1277
|
for (int i = 0; i < n; ++i) {
|
1244
1278
|
int l = nearest_int(iscale * x[i]);
|
1245
1279
|
l = MAX(-nmax, MIN(nmax-1, l));
|
1246
1280
|
L[i] = l + nmax;
|
1247
|
-
float w =
|
1281
|
+
float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
|
1248
1282
|
sumlx += w*x[i]*l;
|
1249
1283
|
suml2 += w*l*l;
|
1250
1284
|
}
|
@@ -1260,7 +1294,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
1260
1294
|
for (int i = 0; i < n; ++i) {
|
1261
1295
|
int l = nearest_int(iscale * x[i]);
|
1262
1296
|
l = MAX(-nmax, MIN(nmax-1, l));
|
1263
|
-
float w =
|
1297
|
+
float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
|
1264
1298
|
sumlx += w*x[i]*l;
|
1265
1299
|
suml2 += w*l*l;
|
1266
1300
|
}
|
@@ -1608,6 +1642,241 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n
|
|
1608
1642
|
return (n/QK_K*sizeof(block_q2_K));
|
1609
1643
|
}
|
1610
1644
|
|
1645
|
+
// Quantizes n values x[i] to integer levels L[i] in [0, nmax] so that
// x[i] ~= scale * L[i] - *the_min, and returns the scale.
//
//   weights  per-element importance; when NULL, x[i]*x[i] is used instead
//   Laux     scratch buffer of n levels used while trying candidate scales
//   rmin, rdelta, nstep
//            candidate inverse scales are (rmin + rdelta*is + nmax)/(max - min)
//            for is = 0..nstep; nstep < 1 disables the search
//   use_mad  true: minimise weighted absolute error, false: weighted squared error
//
// The offset is never allowed to be positive, so *the_min is always >= 0.
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
        float rmin, float rdelta, int nstep, bool use_mad) {
    // Range of the data plus the weighted sums needed by the least-squares fit.
    float lo = x[0];
    float hi = x[0];
    float sum_w = weights ? weights[0] : x[0]*x[0];
    float sum_x = sum_w * x[0];
    for (int i = 1; i < n; ++i) {
        if (x[i] < lo) lo = x[i];
        if (x[i] > hi) hi = x[i];
        const float w = weights ? weights[i] : x[i]*x[i];
        sum_w += w;
        sum_x += w * x[i];
    }
    if (lo > 0) {
        lo = 0;
    }
    if (hi <= lo) {
        // Degenerate range: every level is zero and only the offset carries information.
        for (int i = 0; i < n; ++i) L[i] = 0;
        *the_min = -lo;
        return 0.f;
    }

    // Baseline: plain min/max quantization and its weighted error.
    float iscale = nmax/(hi - lo);
    float scale = 1/iscale;
    float best_err = 0;
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(iscale*(x[i] - lo));
        L[i] = MAX(0, MIN(nmax, l));
        float diff = scale * L[i] + lo - x[i];
        diff = use_mad ? fabsf(diff) : diff*diff;
        const float w = weights ? weights[i] : x[i]*x[i];
        best_err += w * diff;
    }
    if (nstep < 1) {
        *the_min = -lo;
        return scale;
    }

    // Try nearby inverse scales; for each, solve for the (scale, min) pair that
    // best fits the resulting levels and keep it if the error improves.
    // Note: `lo` is overwritten when a candidate wins, so later candidates are
    // built from the updated offset.
    for (int is = 0; is <= nstep; ++is) {
        iscale = (rmin + rdelta*is + nmax)/(hi - lo);
        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(iscale*(x[i] - lo));
            l = MAX(0, MIN(nmax, l));
            Laux[i] = l;
            const float w = weights ? weights[i] : x[i]*x[i];
            sum_l  += w*l;
            sum_l2 += w*l*l;
            sum_xl += w*l*x[i];
        }
        const float D = sum_w * sum_l2 - sum_l * sum_l;
        if (!(D > 0)) {
            continue; // singular system, nothing to solve
        }
        float cand_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
        float cand_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
        if (cand_min > 0) {
            // Offset must not be positive: pin it to zero and refit the scale alone.
            cand_min   = 0;
            cand_scale = sum_xl / sum_l2;
        }
        float err = 0;
        for (int i = 0; i < n; ++i) {
            float diff = cand_scale * Laux[i] + cand_min - x[i];
            diff = use_mad ? fabsf(diff) : diff*diff;
            const float w = weights ? weights[i] : x[i]*x[i];
            err += w * diff;
        }
        if (err < best_err) {
            memcpy(L, Laux, (size_t)n);
            best_err = err;
            scale = cand_scale;
            lo = cand_min;
        }
    }
    *the_min = -lo;
    return scale;
}
|
1722
|
+
|
1723
|
+
// Quantizes n values x[i] to levels L[i] in [0, nmax] with a single scale
// (x[i] ~= scale * L[i]) and returns that scale.
//
//   quant_weights  per-element weights of the squared error (must be non-NULL)
//
// Steps: start from iscale = nmax/max, try eight nearby inverse scales and keep
// the one with the lowest weighted squared error, then refine individual
// levels for up to five passes while the weighted fit improves.
// Returns 0 when no input is positive. callers visible here only pass non-negative values.
static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
    float max = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    if (!max) { // all zero
        for (int i = 0; i < n; ++i) { L[i] = 0; }
        return 0.f;
    }
    float iscale = nmax / max;
    for (int i = 0; i < n; ++i) {
        L[i] = nearest_int(iscale * x[i]);
    }
    float scale = 1/iscale;
    // Error of the plain max-based quantization; candidates must beat this.
    float best_mse = 0;
    for (int i = 0; i < n; ++i) {
        float diff = x[i] - scale*L[i];
        float w = quant_weights[i];
        best_mse += w*diff*diff;
    }
    for (int is = -4; is <= 4; ++is) {
        if (is == 0) continue; // is == 0 is the baseline evaluated above
        float iscale_is = (0.1f*is + nmax)/max;
        float scale_is = 1/iscale_is;
        float mse = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(iscale_is*x[i]);
            l = MIN(nmax, l);
            float diff = x[i] - scale_is*l;
            float w = quant_weights[i];
            mse += w*diff*diff;
        }
        if (mse < best_mse) {
            best_mse = mse;
            iscale = iscale_is;
        }
    }
    // Re-quantize with the winning inverse scale and accumulate the sums that
    // define the least-squares scale sumlx/suml2.
    float sumlx = 0;
    float suml2 = 0;
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(iscale * x[i]);
        l = MIN(nmax, l);
        L[i] = l;
        float w = quant_weights[i];
        sumlx += w*x[i]*l;
        suml2 += w*l*l;
    }
    // Coordinate descent: move one level at a time if that increases
    // sumlx^2/suml2 (compared by cross-multiplication to avoid the divisions).
    for (int itry = 0; itry < 5; ++itry) {
        int n_changed = 0;
        for (int i = 0; i < n; ++i) {
            float w = quant_weights[i];
            float slx = sumlx - w*x[i]*L[i];
            float sl2 = suml2 - w*L[i]*L[i];
            if (slx > 0 && sl2 > 0) {
                int new_l = nearest_int(x[i] * sl2 / slx);
                new_l = MIN(nmax, new_l);
                if (new_l != L[i]) {
                    slx += w*x[i]*new_l;
                    sl2 += w*new_l*new_l;
                    if (slx*slx*suml2 > sumlx*sumlx*sl2) {
                        L[i] = new_l; sumlx = slx; suml2 = sl2;
                        ++n_changed;
                    }
                }
            }
        }
        if (!n_changed) {
            break;
        }
    }
    // With all-zero weights suml2 is 0 and sumlx/suml2 would be NaN; fall back
    // to the scale selected by the search so the caller never stores a NaN.
    if (!(suml2 > 0)) {
        return 1/iscale;
    }
    return sumlx / suml2;
}
|
1795
|
+
|
1796
|
+
// Importance-weighted Q2_K quantization of one row.
//
//   x              k input floats (k must be a multiple of QK_K)
//   y              output, k/QK_K blocks
//   quant_weights  per-element importance weights for the row (required)
//
// Per super-block: each 16-element sub-block gets 2-bit levels plus a scale and
// a min from make_qkx3_quants; the scales and mins are then quantized to 4 bits
// with make_qp_quants, the levels are recomputed against the stored (rounded)
// scales, and everything is packed into the block.
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
    GGML_ASSERT(quant_weights);
    assert(k % QK_K == 0);
    const int nb = k / QK_K;
    const bool requantize = true;

    uint8_t L[QK_K];
    uint8_t Laux[16];
    float mins[QK_K/16];
    float scales[QK_K/16];
    float sw[QK_K/16];
    // One 16-element sub-block of weights. The size must not depend on QK_K:
    // sized QK_K/16 it would overflow for super-blocks smaller than 256.
    float weight[16];
    uint8_t Ls[QK_K/16], Lm[QK_K/16];

    for (int i = 0; i < nb; i++) {
        memset(sw, 0, QK_K/16*sizeof(float));
        float sumx2 = 0;
        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
        float sigma2 = sumx2/QK_K;
        for (int j = 0; j < QK_K/16; ++j) {
            const float * restrict qw = quant_weights + QK_K * i + 16*j;
            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
            // sw[j] (total sub-block weight) later weights the scale/min quantization.
            for (int l = 0; l < 16; ++l) sw[j] += weight[l];
            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
        }

        float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
        float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
        y[i].d = GGML_FP32_TO_FP16(dm);
        y[i].dmin = GGML_FP32_TO_FP16(mm);
        // Continue with the values as they will be read back after fp16 rounding.
        dm = GGML_FP16_TO_FP32(y[i].d);
        mm = GGML_FP16_TO_FP32(y[i].dmin);

        // Low nibble: 4-bit scale, high nibble: 4-bit min.
        for (int j = 0; j < QK_K/16; ++j) {
            y[i].scales[j] = Ls[j] | (Lm[j] << 4);
        }

        if (requantize) {
            for (int j = 0; j < QK_K/16; ++j) {
                const float d = dm * (y[i].scales[j] & 0xF);
                if (!d) continue; // zero scale: keep the levels from the first pass
                const float m = mm * (y[i].scales[j] >> 4);
                for (int ii = 0; ii < 16; ++ii) {
                    int l = nearest_int((x[16*j + ii] + m)/d);
                    l = MAX(0, MIN(3, l));
                    L[16*j + ii] = l;
                }
            }
        }

        // Pack four 2-bit levels per byte.
#if QK_K == 256
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
            }
        }
#else
        for (int l = 0; l < 16; ++l) {
            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
        }
#endif

        x += QK_K;

    }
}
|
1862
|
+
|
1863
|
+
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
1864
|
+
(void)hist;
|
1865
|
+
int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
1866
|
+
if (!quant_weights) {
|
1867
|
+
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
1868
|
+
}
|
1869
|
+
else {
|
1870
|
+
char * qrow = (char *)dst;
|
1871
|
+
for (int row = 0; row < nrow; ++row) {
|
1872
|
+
quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
|
1873
|
+
src += n_per_row;
|
1874
|
+
qrow += row_size;
|
1875
|
+
}
|
1876
|
+
}
|
1877
|
+
return nrow * row_size;
|
1878
|
+
}
|
1879
|
+
|
1611
1880
|
//========================= 3-bit (de)-quantization
|
1612
1881
|
|
1613
1882
|
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
@@ -1821,6 +2090,112 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n
|
|
1821
2090
|
return (n/QK_K*sizeof(block_q3_K));
|
1822
2091
|
}
|
1823
2092
|
|
2093
|
+
// Importance-weighted Q3_K quantization of one row.
//
//   x              n_per_row input floats (must be a multiple of QK_K)
//   y              output, n_per_row/QK_K blocks
//   quant_weights  per-element importance weights for the row, or NULL to
//                  weight each element by its own square
//
// Only implemented for QK_K == 256; other sizes defer to the reference
// quantizer and ignore quant_weights.
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
#if QK_K != 256
    (void)quant_weights;
    quantize_row_q3_K_reference(x, y, n_per_row);
#else
    assert(n_per_row % QK_K == 0);
    const int nb = n_per_row / QK_K;

    int8_t L[QK_K];
    float scales[QK_K / 16];
    float weight[16];
    float sw[QK_K / 16];
    int8_t Ls[QK_K / 16];

    for (int i = 0; i < nb; i++) {

        float sumx2 = 0;
        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
        float sigma2 = 2*sumx2/QK_K;

        for (int j = 0; j < QK_K/16; ++j) {
            if (quant_weights) {
                const float * qw = quant_weights + QK_K * i + 16*j;
                for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
            } else {
                for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
            }
            // sw[j] (total sub-block weight) later weights the scale quantization.
            float sumw = 0;
            for (int l = 0; l < 16; ++l) sumw += weight[l];
            sw[j] = sumw;

            scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);

        }

        memset(y[i].scales, 0, 12);

        // Quantize the 16 sub-block scales to 6 bits (Ls[j] in 0..63) and pack
        // them: low 4 bits as nibbles in bytes 0..7, high 2 bits in bytes 8..11.
        float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
        for (int j = 0; j < QK_K/16; ++j) {
            int l = Ls[j];
            if (j < 8) {
                y[i].scales[j] = l & 0xF;
            } else {
                y[i].scales[j-8] |= ((l & 0xF) << 4);
            }
            l >>= 4;
            y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
        }
        y[i].d = GGML_FP32_TO_FP16(d_block);

        // Recompute the levels against the scales as they will be decoded.
        int8_t sc;
        for (int j = 0; j < QK_K/16; ++j) {
            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
            if (!d) {
                continue; // zero scale: keep the levels from the first pass
            }
            for (int ii = 0; ii < 16; ++ii) {
                int l = nearest_int(x[16*j + ii]/d);
                l = MAX(-4, MIN(3, l));
                L[16*j + ii] = l + 4;
            }
        }

        memset(y[i].hmask, 0, QK_K/8);
        // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
        int m = 0;
        uint8_t hm = 1;
        for (int j = 0; j < QK_K; ++j) {
            if (L[j] > 3) {
                y[i].hmask[m] |= hm;
                L[j] -= 4;
            }
            if (++m == QK_K/8) {
                m = 0; hm <<= 1;
            }
        }
        // Pack the remaining low 2 bits, four levels per byte.
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
            }
        }

        x += QK_K;
    }
#endif
}
|
2181
|
+
|
2182
|
+
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
2183
|
+
(void)hist;
|
2184
|
+
int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
2185
|
+
if (!quant_weights) {
|
2186
|
+
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
2187
|
+
}
|
2188
|
+
else {
|
2189
|
+
char * qrow = (char *)dst;
|
2190
|
+
for (int row = 0; row < nrow; ++row) {
|
2191
|
+
quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
|
2192
|
+
src += n_per_row;
|
2193
|
+
qrow += row_size;
|
2194
|
+
}
|
2195
|
+
}
|
2196
|
+
return nrow * row_size;
|
2197
|
+
}
|
2198
|
+
|
1824
2199
|
// ====================== 4-bit (de)-quantization
|
1825
2200
|
|
1826
2201
|
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
|
@@ -1986,36 +2361,38 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n
|
|
1986
2361
|
return (n/QK_K*sizeof(block_q4_K));
|
1987
2362
|
}
|
1988
2363
|
|
1989
|
-
|
1990
|
-
|
1991
|
-
void
|
1992
|
-
|
1993
|
-
|
2364
|
+
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
|
2365
|
+
#if QK_K != 256
|
2366
|
+
(void)quant_weights;
|
2367
|
+
quantize_row_q4_K_reference(x, y, n_per_row);
|
2368
|
+
#else
|
2369
|
+
assert(n_per_row % QK_K == 0);
|
2370
|
+
const int nb = n_per_row / QK_K;
|
1994
2371
|
|
1995
|
-
#if QK_K == 256
|
1996
2372
|
uint8_t L[QK_K];
|
2373
|
+
uint8_t Laux[32];
|
2374
|
+
float weights[32];
|
1997
2375
|
float mins[QK_K/32];
|
1998
2376
|
float scales[QK_K/32];
|
1999
|
-
float weights[32];
|
2000
|
-
uint8_t Laux[32];
|
2001
|
-
#else
|
2002
|
-
int8_t L[QK_K];
|
2003
|
-
float scales[QK_K/16];
|
2004
|
-
#endif
|
2005
2377
|
|
2006
2378
|
for (int i = 0; i < nb; i++) {
|
2007
2379
|
|
2008
|
-
|
2380
|
+
float sum_x2 = 0;
|
2381
|
+
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
|
2382
|
+
float sigma2 = sum_x2/QK_K;
|
2383
|
+
float av_x = sqrtf(sigma2);
|
2009
2384
|
|
2010
2385
|
float max_scale = 0; // as we are deducting the min, scales are always positive
|
2011
2386
|
float max_min = 0;
|
2012
2387
|
for (int j = 0; j < QK_K/32; ++j) {
|
2013
|
-
|
2014
|
-
|
2015
|
-
|
2016
|
-
|
2017
|
-
|
2018
|
-
|
2388
|
+
if (quant_weights) {
|
2389
|
+
const float * qw = quant_weights + QK_K*i + 32*j;
|
2390
|
+
for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
|
2391
|
+
} else {
|
2392
|
+
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
2393
|
+
}
|
2394
|
+
scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
2395
|
+
//scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
|
2019
2396
|
float scale = scales[j];
|
2020
2397
|
if (scale > max_scale) {
|
2021
2398
|
max_scale = scale;
|
@@ -2053,18 +2430,118 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
2053
2430
|
const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
|
2054
2431
|
for (int ii = 0; ii < 32; ++ii) {
|
2055
2432
|
int l = nearest_int((x[32*j + ii] + dm)/d);
|
2056
|
-
l = MAX(0, MIN(
|
2433
|
+
l = MAX(0, MIN(15, l));
|
2057
2434
|
L[32*j + ii] = l;
|
2058
2435
|
}
|
2059
2436
|
}
|
2437
|
+
uint8_t * q = y[i].qs;
|
2438
|
+
for (int j = 0; j < QK_K; j += 64) {
|
2439
|
+
for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
|
2440
|
+
q += 32;
|
2441
|
+
}
|
2060
2442
|
|
2061
|
-
|
2062
|
-
uint8_t * restrict ql = y[i].qs;
|
2063
|
-
memset(qh, 0, QK_K/8);
|
2443
|
+
x += QK_K;
|
2064
2444
|
|
2065
|
-
|
2066
|
-
|
2067
|
-
|
2445
|
+
}
|
2446
|
+
#endif
|
2447
|
+
}
|
2448
|
+
|
2449
|
+
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
2450
|
+
(void)hist;
|
2451
|
+
int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
2452
|
+
if (!quant_weights) {
|
2453
|
+
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
2454
|
+
}
|
2455
|
+
else {
|
2456
|
+
char * qrow = (char *)dst;
|
2457
|
+
for (int row = 0; row < nrow; ++row) {
|
2458
|
+
quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
|
2459
|
+
src += n_per_row;
|
2460
|
+
qrow += row_size;
|
2461
|
+
}
|
2462
|
+
}
|
2463
|
+
return nrow * row_size;
|
2464
|
+
}
|
2465
|
+
|
2466
|
+
// ====================== 5-bit (de)-quantization
|
2467
|
+
|
2468
|
+
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
|
2469
|
+
assert(k % QK_K == 0);
|
2470
|
+
const int nb = k / QK_K;
|
2471
|
+
|
2472
|
+
#if QK_K == 256
|
2473
|
+
uint8_t L[QK_K];
|
2474
|
+
float mins[QK_K/32];
|
2475
|
+
float scales[QK_K/32];
|
2476
|
+
float weights[32];
|
2477
|
+
uint8_t Laux[32];
|
2478
|
+
#else
|
2479
|
+
int8_t L[QK_K];
|
2480
|
+
float scales[QK_K/16];
|
2481
|
+
#endif
|
2482
|
+
|
2483
|
+
for (int i = 0; i < nb; i++) {
|
2484
|
+
|
2485
|
+
#if QK_K == 256
|
2486
|
+
|
2487
|
+
float max_scale = 0; // as we are deducting the min, scales are always positive
|
2488
|
+
float max_min = 0;
|
2489
|
+
for (int j = 0; j < QK_K/32; ++j) {
|
2490
|
+
//scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
|
2491
|
+
float sum_x2 = 0;
|
2492
|
+
for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
|
2493
|
+
float av_x = sqrtf(sum_x2/32);
|
2494
|
+
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
2495
|
+
scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
|
2496
|
+
float scale = scales[j];
|
2497
|
+
if (scale > max_scale) {
|
2498
|
+
max_scale = scale;
|
2499
|
+
}
|
2500
|
+
float min = mins[j];
|
2501
|
+
if (min > max_min) {
|
2502
|
+
max_min = min;
|
2503
|
+
}
|
2504
|
+
}
|
2505
|
+
|
2506
|
+
float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
|
2507
|
+
float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
|
2508
|
+
for (int j = 0; j < QK_K/32; ++j) {
|
2509
|
+
uint8_t ls = nearest_int(inv_scale*scales[j]);
|
2510
|
+
uint8_t lm = nearest_int(inv_min*mins[j]);
|
2511
|
+
ls = MIN(63, ls);
|
2512
|
+
lm = MIN(63, lm);
|
2513
|
+
if (j < 4) {
|
2514
|
+
y[i].scales[j] = ls;
|
2515
|
+
y[i].scales[j+4] = lm;
|
2516
|
+
} else {
|
2517
|
+
y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
|
2518
|
+
y[i].scales[j-4] |= ((ls >> 4) << 6);
|
2519
|
+
y[i].scales[j-0] |= ((lm >> 4) << 6);
|
2520
|
+
}
|
2521
|
+
}
|
2522
|
+
y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
|
2523
|
+
y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
|
2524
|
+
|
2525
|
+
uint8_t sc, m;
|
2526
|
+
for (int j = 0; j < QK_K/32; ++j) {
|
2527
|
+
get_scale_min_k4(j, y[i].scales, &sc, &m);
|
2528
|
+
const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
|
2529
|
+
if (!d) continue;
|
2530
|
+
const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
|
2531
|
+
for (int ii = 0; ii < 32; ++ii) {
|
2532
|
+
int l = nearest_int((x[32*j + ii] + dm)/d);
|
2533
|
+
l = MAX(0, MIN(31, l));
|
2534
|
+
L[32*j + ii] = l;
|
2535
|
+
}
|
2536
|
+
}
|
2537
|
+
|
2538
|
+
uint8_t * restrict qh = y[i].qh;
|
2539
|
+
uint8_t * restrict ql = y[i].qs;
|
2540
|
+
memset(qh, 0, QK_K/8);
|
2541
|
+
|
2542
|
+
uint8_t m1 = 1, m2 = 2;
|
2543
|
+
for (int n = 0; n < QK_K; n += 64) {
|
2544
|
+
for (int j = 0; j < 32; ++j) {
|
2068
2545
|
int l1 = L[n + j];
|
2069
2546
|
if (l1 > 15) {
|
2070
2547
|
l1 -= 16; qh[j] |= m1;
|
@@ -2081,7 +2558,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
2081
2558
|
#else
|
2082
2559
|
float max_scale = 0, amax = 0;
|
2083
2560
|
for (int j = 0; j < QK_K/16; ++j) {
|
2084
|
-
scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1);
|
2561
|
+
scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL);
|
2085
2562
|
float abs_scale = fabsf(scales[j]);
|
2086
2563
|
if (abs_scale > amax) {
|
2087
2564
|
amax = abs_scale;
|
@@ -2192,6 +2669,123 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n
|
|
2192
2669
|
return (n/QK_K*sizeof(block_q5_K));
|
2193
2670
|
}
|
2194
2671
|
|
2672
|
+
// Importance-weighted Q5_K quantization of one row.
//
//   x              n_per_row input floats (must be a multiple of QK_K)
//   y              output, n_per_row/QK_K blocks
//   quant_weights  per-element importance weights for the row, or NULL to use
//                  av_x + |x| as the weight
//
// Only implemented for QK_K == 256; other sizes defer to the reference
// quantizer and ignore quant_weights.
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
#if QK_K != 256
    (void)quant_weights;
    quantize_row_q5_K_reference(x, y, n_per_row);
#else
    assert(n_per_row % QK_K == 0);
    const int n_blocks = n_per_row / QK_K;

    uint8_t levels[QK_K];
    float   sub_min[QK_K/32];
    float   sub_scale[QK_K/32];
    float   sub_w[32];
    uint8_t scratch[32];

    for (int ib = 0; ib < n_blocks; ib++) {

        // Mean square of the super-block, used to regularise the weights.
        float sum_x2 = 0;
        for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
        const float sigma2 = sum_x2/QK_K;
        const float av_x = sqrtf(sigma2);

        // Pass 1: 5-bit levels plus float scale/min for each 32-element sub-block.
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            if (quant_weights) {
                const float * qw = quant_weights + QK_K*ib + 32*j;
                for (int l = 0; l < 32; ++l) sub_w[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
            } else {
                for (int l = 0; l < 32; ++l) sub_w[l] = av_x + fabsf(x[32*j + l]);
            }
            sub_scale[j] = make_qkx3_quants(32, 31, x + 32*j, sub_w, levels + 32*j, &sub_min[j], scratch, -0.9f, 0.05f, 36, false);
            if (sub_scale[j] > max_scale) {
                max_scale = sub_scale[j];
            }
            if (sub_min[j] > max_min) {
                max_min = sub_min[j];
            }
        }

        // Pass 2: quantize scales and mins to 6 bits and pack them.
        const float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
        const float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
        for (int j = 0; j < QK_K/32; ++j) {
            uint8_t ls = nearest_int(inv_scale*sub_scale[j]);
            uint8_t lm = nearest_int(inv_min*sub_min[j]);
            ls = MIN(63, ls);
            lm = MIN(63, lm);
            if (j >= 4) {
                // Low nibbles share one byte; the top two bits of each value go
                // into the spare high bits of the bytes written for j < 4.
                y[ib].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
                y[ib].scales[j-4] |= ((ls >> 4) << 6);
                y[ib].scales[j-0] |= ((lm >> 4) << 6);
            } else {
                y[ib].scales[j] = ls;
                y[ib].scales[j+4] = lm;
            }
        }
        y[ib].d = GGML_FP32_TO_FP16(max_scale/63.f);
        y[ib].dmin = GGML_FP32_TO_FP16(max_min/63.f);

        // Pass 3: recompute the levels against the scales/mins as stored.
        uint8_t sc, m;
        for (int j = 0; j < QK_K/32; ++j) {
            get_scale_min_k4(j, y[ib].scales, &sc, &m);
            const float d = GGML_FP16_TO_FP32(y[ib].d) * sc;
            if (!d) continue; // zero scale: keep the levels from pass 1
            const float dm = GGML_FP16_TO_FP32(y[ib].dmin) * m;
            for (int ii = 0; ii < 32; ++ii) {
                int l = nearest_int((x[32*j + ii] + dm)/d);
                l = MAX(0, MIN(31, l));
                levels[32*j + ii] = l;
            }
        }

        // Pass 4: low 4 bits go to qs (two levels per byte), bit 4 goes to qh.
        uint8_t * restrict qh = y[ib].qh;
        uint8_t * restrict ql = y[ib].qs;
        memset(qh, 0, QK_K/8);

        uint8_t m1 = 1, m2 = 2;
        for (int n = 0; n < QK_K; n += 64) {
            for (int j = 0; j < 32; ++j) {
                int l1 = levels[n + j];
                if (l1 > 15) {
                    l1 -= 16; qh[j] |= m1;
                }
                int l2 = levels[n + j + 32];
                if (l2 > 15) {
                    l2 -= 16; qh[j] |= m2;
                }
                ql[j] = l1 | (l2 << 4);
            }
            m1 <<= 2; m2 <<= 2;
            ql += 32;
        }

        x += QK_K;

    }
#endif
}
|
2771
|
+
|
2772
|
+
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
2773
|
+
(void)hist;
|
2774
|
+
int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
2775
|
+
if (!quant_weights) {
|
2776
|
+
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
2777
|
+
}
|
2778
|
+
else {
|
2779
|
+
char * qrow = (char *)dst;
|
2780
|
+
for (int row = 0; row < nrow; ++row) {
|
2781
|
+
quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
|
2782
|
+
src += n_per_row;
|
2783
|
+
qrow += row_size;
|
2784
|
+
}
|
2785
|
+
}
|
2786
|
+
return nrow * row_size;
|
2787
|
+
}
|
2788
|
+
|
2195
2789
|
// ====================== 6-bit (de)-quantization
|
2196
2790
|
|
2197
2791
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
@@ -2208,7 +2802,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|
2208
2802
|
|
2209
2803
|
for (int ib = 0; ib < QK_K/16; ++ib) {
|
2210
2804
|
|
2211
|
-
const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1);
|
2805
|
+
const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
|
2212
2806
|
scales[ib] = scale;
|
2213
2807
|
|
2214
2808
|
const float abs_scale = fabsf(scale);
|
@@ -2276,68 +2870,365 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|
2276
2870
|
}
|
2277
2871
|
}
|
2278
2872
|
|
2279
|
-
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
|
2280
|
-
assert(k % QK_K == 0);
|
2281
|
-
const int nb = k / QK_K;
|
2873
|
+
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
|
2874
|
+
assert(k % QK_K == 0);
|
2875
|
+
const int nb = k / QK_K;
|
2876
|
+
|
2877
|
+
for (int i = 0; i < nb; i++) {
|
2878
|
+
|
2879
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
2880
|
+
|
2881
|
+
const uint8_t * restrict ql = x[i].ql;
|
2882
|
+
const uint8_t * restrict qh = x[i].qh;
|
2883
|
+
const int8_t * restrict sc = x[i].scales;
|
2884
|
+
|
2885
|
+
#if QK_K == 256
|
2886
|
+
for (int n = 0; n < QK_K; n += 128) {
|
2887
|
+
for (int l = 0; l < 32; ++l) {
|
2888
|
+
int is = l/16;
|
2889
|
+
const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
2890
|
+
const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
2891
|
+
const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
2892
|
+
const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
2893
|
+
y[l + 0] = d * sc[is + 0] * q1;
|
2894
|
+
y[l + 32] = d * sc[is + 2] * q2;
|
2895
|
+
y[l + 64] = d * sc[is + 4] * q3;
|
2896
|
+
y[l + 96] = d * sc[is + 6] * q4;
|
2897
|
+
}
|
2898
|
+
y += 128;
|
2899
|
+
ql += 64;
|
2900
|
+
qh += 32;
|
2901
|
+
sc += 8;
|
2902
|
+
}
|
2903
|
+
#else
|
2904
|
+
for (int l = 0; l < 16; ++l) {
|
2905
|
+
const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
2906
|
+
const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
2907
|
+
const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
2908
|
+
const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
2909
|
+
y[l+ 0] = d * sc[0] * q1;
|
2910
|
+
y[l+16] = d * sc[1] * q2;
|
2911
|
+
y[l+32] = d * sc[2] * q3;
|
2912
|
+
y[l+48] = d * sc[3] * q4;
|
2913
|
+
}
|
2914
|
+
y += 64;
|
2915
|
+
#endif
|
2916
|
+
|
2917
|
+
}
|
2918
|
+
}
|
2919
|
+
|
2920
|
+
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
2921
|
+
assert(k % QK_K == 0);
|
2922
|
+
block_q6_K * restrict y = vy;
|
2923
|
+
quantize_row_q6_K_reference(x, y, k);
|
2924
|
+
}
|
2925
|
+
|
2926
|
+
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
2927
|
+
assert(k % QK_K == 0);
|
2928
|
+
(void)hist; // TODO: collect histograms
|
2929
|
+
|
2930
|
+
for (int j = 0; j < n; j += k) {
|
2931
|
+
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
2932
|
+
quantize_row_q6_K_reference(src + j, y, k);
|
2933
|
+
}
|
2934
|
+
return (n/QK_K*sizeof(block_q6_K));
|
2935
|
+
}
|
2936
|
+
|
2937
|
+
// Quantize one row of n_per_row floats into Q6_K super-blocks, optionally
// guided by per-value importance weights (quant_weights may be NULL).
// When QK_K != 256 the weights are ignored and the reference quantizer is used.
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
#if QK_K != 256
    (void)quant_weights;
    quantize_row_q6_K_reference(x, y, n_per_row);
#else
    assert(n_per_row % QK_K == 0);
    const int n_super = n_per_row / QK_K;

    int8_t L[QK_K];          // quantized levels of the current super-block
    float  scales[QK_K/16];  // one float scale per 16-value group

    // x always points at the start of super-block i.
    for (int i = 0; i < n_super; ++i, x += QK_K) {

        float max_scale     = 0;
        float max_abs_scale = 0;

        // Pass 1: per-group scale (and initial levels in L) from make_qx_quants,
        // tracking the scale with the largest magnitude.
        for (int ib = 0; ib < QK_K/16; ++ib) {
            const float * qw = quant_weights ? quant_weights + QK_K*i + 16*ib : NULL;
            const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw);
            scales[ib] = scale;

            const float abs_scale = fabsf(scale);
            if (abs_scale > max_abs_scale) {
                max_abs_scale = abs_scale;
                max_scale     = scale;
            }
        }

        // All group scales are zero: emit an all-zero block.
        if (!max_abs_scale) {
            memset(&y[i], 0, sizeof(block_q6_K));
            y[i].d = GGML_FP32_TO_FP16(0.f);
            continue;
        }

        // Quantize the group scales to int8 relative to the super-block scale d.
        float iscale = -128.f/max_scale;
        y[i].d = GGML_FP32_TO_FP16(1/iscale);
        for (int ib = 0; ib < QK_K/16; ++ib) {
            y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
        }

        // Pass 2: requantize the values against the scales as actually stored.
        // Groups whose effective scale is zero keep the levels from pass 1.
        const float d_super = GGML_FP16_TO_FP32(y[i].d);
        for (int j = 0; j < QK_K/16; ++j) {
            const float d = d_super * y[i].scales[j];
            if (d != 0) {
                for (int ii = 0; ii < 16; ++ii) {
                    int l = nearest_int(x[16*j + ii]/d);
                    l = MAX(-32, MIN(31, l));
                    L[16*j + ii] = l + 32;
                }
            }
        }

        // Pack: low nibbles of four 32-value runs into ql, their high 2 bits into qh.
        for (int j = 0; j < QK_K; j += 128) {
            uint8_t * ql = y[i].ql + j/2;
            uint8_t * qh = y[i].qh + j/4;
            for (int l = 0; l < 32; ++l) {
                const int8_t v1 = L[j + l +  0];
                const int8_t v2 = L[j + l + 32];
                const int8_t v3 = L[j + l + 64];
                const int8_t v4 = L[j + l + 96];
                ql[l+ 0] = (v1 & 0xF) | ((v3 & 0xF) << 4);
                ql[l+32] = (v2 & 0xF) | ((v4 & 0xF) << 4);
                qh[l]    = (v1 >> 4) | ((v2 >> 4) << 2) | ((v3 >> 4) << 4) | ((v4 >> 4) << 6);
            }
        }
    }
#endif
}
|
3025
|
+
|
3026
|
+
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3027
|
+
(void)hist;
|
3028
|
+
int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
3029
|
+
if (!quant_weights) {
|
3030
|
+
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
3031
|
+
}
|
3032
|
+
else {
|
3033
|
+
char * qrow = (char *)dst;
|
3034
|
+
for (int row = 0; row < nrow; ++row) {
|
3035
|
+
quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
|
3036
|
+
src += n_per_row;
|
3037
|
+
qrow += row_size;
|
3038
|
+
}
|
3039
|
+
}
|
3040
|
+
return nrow * row_size;
|
3041
|
+
}
|
3042
|
+
|
3043
|
+
// Quantize one row of n_per_row floats into Q4_0 blocks using importance
// weights. With quant_weights == NULL this is exactly the reference quantizer.
// Each value's weight is quant_weights[j] * sqrt(sigma2 + x[j]^2), where sigma2
// is the mean square of the whole row.
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_0 == 32, "QK4_0 must be 32");

    if (quant_weights == NULL) {
        quantize_row_q4_0_reference(x, y, n_per_row);
        return;
    }

    float sq_total = 0;
    for (int i = 0; i < n_per_row; ++i) {
        sq_total += x[i]*x[i];
    }
    const float sigma2 = sq_total/n_per_row;

    float  weight[QK4_0];
    int8_t L[QK4_0];

    const int n_blocks = n_per_row/QK4_0;
    for (int ib = 0; ib < n_blocks; ++ib) {
        const float * xb = x + QK4_0 * ib;
        const float * qw = quant_weights + QK4_0 * ib;
        for (int j = 0; j < QK4_0; ++j) {
            weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        }

        // make_qx_quants fills L and returns the value stored as the block scale.
        const float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
        y[ib].d = GGML_FP32_TO_FP16(d);

        // Two 4-bit levels per byte: value j in the low nibble, value j+16 in the high one.
        for (int j = 0; j < 16; ++j) {
            const int8_t lo = L[j];
            const int8_t hi = L[j+16];
            y[ib].qs[j] = lo | (hi << 4);
        }
    }
}
|
3070
|
+
|
3071
|
+
size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3072
|
+
if (!quant_weights) {
|
3073
|
+
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
3074
|
+
}
|
3075
|
+
int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
3076
|
+
char * qrow = (char *)dst;
|
3077
|
+
for (int row = 0; row < nrow; ++row) {
|
3078
|
+
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
3079
|
+
src += n_per_row;
|
3080
|
+
qrow += row_size;
|
3081
|
+
}
|
3082
|
+
return nrow * row_size;
|
3083
|
+
}
|
3084
|
+
|
3085
|
+
// Quantize one row of n_per_row floats into Q4_1 blocks (scale d plus offset m)
// using importance weights. With quant_weights == NULL this is exactly the
// reference quantizer. Each value's weight is
// quant_weights[j] * sqrt(sigma2 + x[j]^2), sigma2 being the row's mean square.
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_1 == 32, "QK4_1 must be 32");

    if (quant_weights == NULL) {
        quantize_row_q4_1_reference(x, y, n_per_row);
        return;
    }

    float sq_total = 0;
    for (int i = 0; i < n_per_row; ++i) {
        sq_total += x[i]*x[i];
    }
    const float sigma2 = sq_total/n_per_row;

    float   weight[QK4_1];
    uint8_t L[QK4_1], Laux[QK4_1];

    const int n_blocks = n_per_row/QK4_1;
    for (int ib = 0; ib < n_blocks; ++ib) {
        const float * xb = x + QK4_1 * ib;
        const float * qw = quant_weights + QK4_1 * ib;
        for (int j = 0; j < QK4_1; ++j) {
            weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        }

        // make_qkx3_quants fills L, writes the offset through &min and returns
        // the value stored as the block scale; the offset is stored negated.
        float min;
        const float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
        y[ib].d = GGML_FP32_TO_FP16(d);
        y[ib].m = GGML_FP32_TO_FP16(-min);

        // Two 4-bit levels per byte: value j in the low nibble, value j+16 in the high one.
        for (int j = 0; j < 16; ++j) {
            const uint8_t lo = L[j];
            const uint8_t hi = L[j+16];
            y[ib].qs[j] = lo | (hi << 4);
        }
    }
}
|
3114
|
+
|
3115
|
+
size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3116
|
+
if (!quant_weights) {
|
3117
|
+
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
3118
|
+
}
|
3119
|
+
int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
3120
|
+
char * qrow = (char *)dst;
|
3121
|
+
for (int row = 0; row < nrow; ++row) {
|
3122
|
+
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
3123
|
+
src += n_per_row;
|
3124
|
+
qrow += row_size;
|
3125
|
+
}
|
3126
|
+
return nrow * row_size;
|
3127
|
+
}
|
3128
|
+
|
3129
|
+
// Quantize one row of n_per_row floats into Q5_0 blocks using importance
// weights. With quant_weights == NULL this is exactly the reference quantizer.
// Each value's weight is quant_weights[j] * sqrt(sigma2 + x[j]^2), sigma2 being
// the row's mean square.
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_0 == 32, "QK5_0 must be 32");

    if (quant_weights == NULL) {
        quantize_row_q5_0_reference(x, y, n_per_row);
        return;
    }

    float sq_total = 0;
    for (int i = 0; i < n_per_row; ++i) {
        sq_total += x[i]*x[i];
    }
    const float sigma2 = sq_total/n_per_row;

    float  weight[QK5_0];
    int8_t L[QK5_0];

    const int n_blocks = n_per_row/QK5_0;
    for (int ib = 0; ib < n_blocks; ++ib) {
        const float * xb = x + QK5_0 * ib;
        const float * qw = quant_weights + QK5_0 * ib;
        for (int j = 0; j < QK5_0; ++j) {
            weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
        }

        // make_qx_quants fills L and returns the value stored as the block scale.
        const float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
        y[ib].d = GGML_FP32_TO_FP16(d);

        // Low 4 bits of each level go into qs (two per byte); the 5th bit of
        // value j goes to bit j of qh, that of value j+16 to bit j + QK5_0/2.
        uint32_t high_bits = 0;
        for (int j = 0; j < 16; ++j) {
            const uint8_t xi0 = L[j];
            const uint8_t xi1 = L[j+16];

            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);

            high_bits |= ((xi0 & 0x10u) >> 4) << (j + 0);
            high_bits |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
        }

        memcpy(&y[ib].qh, &high_bits, sizeof(high_bits));
    }
}
|
2325
3167
|
|
2326
|
-
|
2327
|
-
|
2328
|
-
|
2329
|
-
|
3168
|
+
size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3169
|
+
if (!quant_weights) {
|
3170
|
+
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
3171
|
+
}
|
3172
|
+
int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
3173
|
+
char * qrow = (char *)dst;
|
3174
|
+
for (int row = 0; row < nrow; ++row) {
|
3175
|
+
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
3176
|
+
src += n_per_row;
|
3177
|
+
qrow += row_size;
|
3178
|
+
}
|
3179
|
+
return nrow * row_size;
|
2330
3180
|
}
|
2331
3181
|
|
2332
|
-
|
2333
|
-
|
2334
|
-
(void)hist; // TODO: collect histograms
|
3182
|
+
// Quantize one row of n_per_row floats into Q5_1 blocks (scale d plus offset m)
// using importance weights. With quant_weights == NULL this is exactly the
// reference quantizer. Each value's weight is
// quant_weights[j] * sqrt(sigma2 + x[j]^2), sigma2 being the row's mean square.
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_1 == 32, "QK5_1 must be 32");

    if (!quant_weights) {
        quantize_row_q5_1_reference(x, y, n_per_row);
        return;
    }

    float weight[QK5_1];
    uint8_t L[QK5_1], Laux[QK5_1];

    float sum_x2 = 0;
    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
    float sigma2 = sum_x2/n_per_row;

    const int nb = n_per_row/QK5_1;
    for (int ib = 0; ib < nb; ++ib) {
        const float * xb = x + QK5_1 * ib;
        const float * qw = quant_weights + QK5_1 * ib;
        for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);

        // make_qkx3_quants fills L, writes the offset through &min and returns
        // the value stored as the block scale; the offset is stored negated.
        float min;
        float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
        y[ib].d = GGML_FP32_TO_FP16(d);
        y[ib].m = GGML_FP32_TO_FP16(-min);

        uint32_t qh = 0;
        for (int j = 0; j < 16; ++j) {
            const uint8_t xi0 = L[j];
            const uint8_t xi1 = L[j+16];
            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
            // get the 5-th bit and store it in qh at the right position
            // (upper half offset derived from this format's own block size)
            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
        }
        memcpy(&y[ib].qh, &qh, sizeof(qh));
    }
}
|
3219
|
+
|
3220
|
+
size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3221
|
+
if (!quant_weights) {
|
3222
|
+
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
3223
|
+
}
|
3224
|
+
int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
3225
|
+
char * qrow = (char *)dst;
|
3226
|
+
for (int row = 0; row < nrow; ++row) {
|
3227
|
+
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
3228
|
+
src += n_per_row;
|
3229
|
+
qrow += row_size;
|
3230
|
+
}
|
3231
|
+
return nrow * row_size;
|
2341
3232
|
}
|
2342
3233
|
|
2343
3234
|
// ====================== "True" 2-bit (de)-quantization
|
@@ -2553,14 +3444,6 @@ static const uint8_t ksigns_iq2xs[128] = {
|
|
2553
3444
|
|
2554
3445
|
static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
2555
3446
|
|
2556
|
-
void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
|
2557
|
-
(void)x;
|
2558
|
-
(void)y;
|
2559
|
-
(void)k;
|
2560
|
-
assert(k % QK_K == 0);
|
2561
|
-
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
|
2562
|
-
}
|
2563
|
-
|
2564
3447
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
2565
3448
|
assert(k % QK_K == 0);
|
2566
3449
|
const int nb = k / QK_K;
|
@@ -2587,33 +3470,8 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
|
|
2587
3470
|
}
|
2588
3471
|
}
|
2589
3472
|
|
2590
|
-
void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
|
2591
|
-
assert(k % QK_K == 0);
|
2592
|
-
block_iq2_xxs * restrict y = vy;
|
2593
|
-
quantize_row_iq2_xxs_reference(x, y, k);
|
2594
|
-
}
|
2595
|
-
|
2596
|
-
size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
|
2597
|
-
assert(k % QK_K == 0);
|
2598
|
-
(void)hist; // TODO: collect histograms
|
2599
|
-
|
2600
|
-
for (int j = 0; j < n; j += k) {
|
2601
|
-
block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
|
2602
|
-
quantize_row_iq2_xxs_reference(src + j, y, k);
|
2603
|
-
}
|
2604
|
-
return (n/QK_K*sizeof(block_iq2_xxs));
|
2605
|
-
}
|
2606
|
-
|
2607
3473
|
// ====================== 2.3125 bpw (de)-quantization
|
2608
3474
|
|
2609
|
-
void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
|
2610
|
-
(void)x;
|
2611
|
-
(void)y;
|
2612
|
-
(void)k;
|
2613
|
-
assert(k % QK_K == 0);
|
2614
|
-
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
|
2615
|
-
}
|
2616
|
-
|
2617
3475
|
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
|
2618
3476
|
assert(k % QK_K == 0);
|
2619
3477
|
const int nb = k / QK_K;
|
@@ -2639,23 +3497,6 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
2639
3497
|
}
|
2640
3498
|
}
|
2641
3499
|
|
2642
|
-
void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
|
2643
|
-
assert(k % QK_K == 0);
|
2644
|
-
block_iq2_xs * restrict y = vy;
|
2645
|
-
quantize_row_iq2_xs_reference(x, y, k);
|
2646
|
-
}
|
2647
|
-
|
2648
|
-
size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
|
2649
|
-
assert(k % QK_K == 0);
|
2650
|
-
(void)hist; // TODO: collect histograms
|
2651
|
-
|
2652
|
-
for (int j = 0; j < n; j += k) {
|
2653
|
-
block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
|
2654
|
-
quantize_row_iq2_xs_reference(src + j, y, k);
|
2655
|
-
}
|
2656
|
-
return (n/QK_K*sizeof(block_iq2_xs));
|
2657
|
-
}
|
2658
|
-
|
2659
3500
|
//===================================== Q8_K ==============================================
|
2660
3501
|
|
2661
3502
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
@@ -7554,9 +8395,9 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|
7554
8395
|
|
7555
8396
|
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
7556
8397
|
|
7557
|
-
|
7558
|
-
|
7559
|
-
|
8398
|
+
ggml_int8x16x4_t q2u;
|
8399
|
+
ggml_int8x16x4_t q2s;
|
8400
|
+
ggml_int8x16x4_t q8b;
|
7560
8401
|
|
7561
8402
|
int32x4x4_t scales32;
|
7562
8403
|
|
@@ -7578,7 +8419,7 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|
7578
8419
|
scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
|
7579
8420
|
int32x4_t sumi = vdupq_n_s32(0);
|
7580
8421
|
for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
|
7581
|
-
q8b =
|
8422
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
7582
8423
|
q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
|
7583
8424
|
q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
|
7584
8425
|
q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
|
@@ -7699,3 +8540,666 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|
7699
8540
|
*s = 0.125f * sumf;
|
7700
8541
|
#endif
|
7701
8542
|
}
|
8543
|
+
|
8544
|
+
// ================================ IQ2 quantization =============================================
|
8545
|
+
|
8546
|
+
// Heap-allocated lookup tables for one IQ2 grid size, built by q2xs_init_impl()
// and released by q2xs_deinit_impl().
typedef struct {
    uint64_t * grid;        // grid points: 8 int8 coordinates (each 2*l + 1) packed per entry
    int      * map;         // packed 2-bit index -> grid index if >= 0, else -(offset + 1) into neighbours
    uint16_t * neighbours;  // records of the form: count, followed by that many grid indices
} iq2_entry_t;

// Slot 0 serves grid size 256, slot 1 grid size 512 (see iq2_data_index()).
// A NULL grid pointer means the slot has not been initialised.
static iq2_entry_t iq2_data[2] = {
    [0] = { .grid = NULL, .map = NULL, .neighbours = NULL },
    [1] = { .grid = NULL, .map = NULL, .neighbours = NULL },
};
|
8556
|
+
|
8557
|
+
// Map a supported grid size to its slot in iq2_data: 256 -> 0, 512 -> 1.
// Any other size trips the assertion.
static inline int iq2_data_index(int grid_size) {
    GGML_ASSERT(grid_size == 256 || grid_size == 512);
    if (grid_size == 256) {
        return 0;
    }
    return 1;
}
|
8561
|
+
|
8562
|
+
// qsort comparator for records of two ints: orders ascending by the first int,
// breaking ties by the second. Returns -1, 0 or 1.
static int iq2_compare_func(const void * left, const void * right) {
    const int * a = (const int *)left;
    const int * b = (const int *)right;

    for (int field = 0; field < 2; ++field) {
        if (a[field] < b[field]) {
            return -1;
        }
        if (a[field] > b[field]) {
            return 1;
        }
    }
    return 0;
}
|
8567
|
+
|
8568
|
+
static void q2xs_init_impl(int grid_size) {
|
8569
|
+
const int gindex = iq2_data_index(grid_size);
|
8570
|
+
if (iq2_data[gindex].grid) {
|
8571
|
+
return;
|
8572
|
+
}
|
8573
|
+
static const uint16_t kgrid_256[256] = {
|
8574
|
+
0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
|
8575
|
+
100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
|
8576
|
+
1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
|
8577
|
+
1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
|
8578
|
+
2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
|
8579
|
+
4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
|
8580
|
+
5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
|
8581
|
+
8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
|
8582
|
+
10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
|
8583
|
+
16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
|
8584
|
+
17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
|
8585
|
+
20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
|
8586
|
+
22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
|
8587
|
+
25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
|
8588
|
+
33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
|
8589
|
+
37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
|
8590
|
+
};
|
8591
|
+
static const uint16_t kgrid_512[512] = {
|
8592
|
+
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
8593
|
+
73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
|
8594
|
+
260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
|
8595
|
+
352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
|
8596
|
+
640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
|
8597
|
+
1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
|
8598
|
+
1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
|
8599
|
+
2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
|
8600
|
+
2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
|
8601
|
+
4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
|
8602
|
+
4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
|
8603
|
+
5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
|
8604
|
+
5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
|
8605
|
+
8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
|
8606
|
+
8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
|
8607
|
+
10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
|
8608
|
+
16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
|
8609
|
+
16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
|
8610
|
+
16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
|
8611
|
+
17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
|
8612
|
+
18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
|
8613
|
+
20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
|
8614
|
+
21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
|
8615
|
+
22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
|
8616
|
+
24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
|
8617
|
+
32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
|
8618
|
+
33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
|
8619
|
+
33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
|
8620
|
+
35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
|
8621
|
+
37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
|
8622
|
+
40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
|
8623
|
+
42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
|
8624
|
+
};
|
8625
|
+
const int kmap_size = 43692;
|
8626
|
+
const int nwant = 2;
|
8627
|
+
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
|
8628
|
+
uint64_t * kgrid_q2xs;
|
8629
|
+
int * kmap_q2xs;
|
8630
|
+
uint16_t * kneighbors_q2xs;
|
8631
|
+
|
8632
|
+
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
8633
|
+
uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
|
8634
|
+
for (int k = 0; k < grid_size; ++k) {
|
8635
|
+
int8_t * pos = (int8_t *)(the_grid + k);
|
8636
|
+
for (int i = 0; i < 8; ++i) {
|
8637
|
+
int l = (kgrid[k] >> 2*i) & 0x3;
|
8638
|
+
pos[i] = 2*l + 1;
|
8639
|
+
}
|
8640
|
+
}
|
8641
|
+
kgrid_q2xs = the_grid;
|
8642
|
+
iq2_data[gindex].grid = the_grid;
|
8643
|
+
kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
|
8644
|
+
iq2_data[gindex].map = kmap_q2xs;
|
8645
|
+
for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
|
8646
|
+
uint64_t aux64;
|
8647
|
+
uint8_t * aux8 = (uint8_t *)&aux64;
|
8648
|
+
for (int i = 0; i < grid_size; ++i) {
|
8649
|
+
aux64 = kgrid_q2xs[i];
|
8650
|
+
uint16_t index = 0;
|
8651
|
+
for (int k=0; k<8; ++k) {
|
8652
|
+
uint16_t q = (aux8[k] - 1)/2;
|
8653
|
+
index |= (q << 2*k);
|
8654
|
+
}
|
8655
|
+
kmap_q2xs[index] = i;
|
8656
|
+
}
|
8657
|
+
int8_t pos[8];
|
8658
|
+
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
8659
|
+
int num_neighbors = 0, num_not_in_map = 0;
|
8660
|
+
for (int i = 0; i < kmap_size; ++i) {
|
8661
|
+
if (kmap_q2xs[i] >= 0) continue;
|
8662
|
+
++num_not_in_map;
|
8663
|
+
for (int k = 0; k < 8; ++k) {
|
8664
|
+
int l = (i >> 2*k) & 0x3;
|
8665
|
+
pos[k] = 2*l + 1;
|
8666
|
+
}
|
8667
|
+
for (int j = 0; j < grid_size; ++j) {
|
8668
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
8669
|
+
int d2 = 0;
|
8670
|
+
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
8671
|
+
dist2[2*j+0] = d2;
|
8672
|
+
dist2[2*j+1] = j;
|
8673
|
+
}
|
8674
|
+
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
8675
|
+
int n = 0; int d2 = dist2[0];
|
8676
|
+
int nhave = 1;
|
8677
|
+
for (int j = 0; j < grid_size; ++j) {
|
8678
|
+
if (dist2[2*j] > d2) {
|
8679
|
+
if (nhave == nwant) break;
|
8680
|
+
d2 = dist2[2*j];
|
8681
|
+
++nhave;
|
8682
|
+
}
|
8683
|
+
++n;
|
8684
|
+
}
|
8685
|
+
num_neighbors += n;
|
8686
|
+
}
|
8687
|
+
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
8688
|
+
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
8689
|
+
iq2_data[gindex].neighbours = kneighbors_q2xs;
|
8690
|
+
int counter = 0;
|
8691
|
+
for (int i = 0; i < kmap_size; ++i) {
|
8692
|
+
if (kmap_q2xs[i] >= 0) continue;
|
8693
|
+
for (int k = 0; k < 8; ++k) {
|
8694
|
+
int l = (i >> 2*k) & 0x3;
|
8695
|
+
pos[k] = 2*l + 1;
|
8696
|
+
}
|
8697
|
+
for (int j = 0; j < grid_size; ++j) {
|
8698
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
8699
|
+
int d2 = 0;
|
8700
|
+
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
8701
|
+
dist2[2*j+0] = d2;
|
8702
|
+
dist2[2*j+1] = j;
|
8703
|
+
}
|
8704
|
+
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
8705
|
+
kmap_q2xs[i] = -(counter + 1);
|
8706
|
+
int d2 = dist2[0];
|
8707
|
+
uint16_t * start = &kneighbors_q2xs[counter++];
|
8708
|
+
int n = 0, nhave = 1;
|
8709
|
+
for (int j = 0; j < grid_size; ++j) {
|
8710
|
+
if (dist2[2*j] > d2) {
|
8711
|
+
if (nhave == nwant) break;
|
8712
|
+
d2 = dist2[2*j];
|
8713
|
+
++nhave;
|
8714
|
+
}
|
8715
|
+
kneighbors_q2xs[counter++] = dist2[2*j+1];
|
8716
|
+
++n;
|
8717
|
+
}
|
8718
|
+
*start = n;
|
8719
|
+
}
|
8720
|
+
free(dist2);
|
8721
|
+
}
|
8722
|
+
|
8723
|
+
void ggml_init_iq2_quantization(enum ggml_type type) {
|
8724
|
+
if (type == GGML_TYPE_IQ2_XXS) {
|
8725
|
+
q2xs_init_impl(256);
|
8726
|
+
}
|
8727
|
+
else if (type == GGML_TYPE_IQ2_XS) {
|
8728
|
+
q2xs_init_impl(512);
|
8729
|
+
}
|
8730
|
+
else {
|
8731
|
+
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
8732
|
+
}
|
8733
|
+
}
|
8734
|
+
|
8735
|
+
static void q2xs_deinit_impl(int grid_size) {
|
8736
|
+
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
|
8737
|
+
const int gindex = iq2_data_index(grid_size);
|
8738
|
+
if (iq2_data[gindex].grid) {
|
8739
|
+
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
8740
|
+
free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
|
8741
|
+
free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
|
8742
|
+
}
|
8743
|
+
}
|
8744
|
+
|
8745
|
+
void ggml_deinit_iq2_quantization(enum ggml_type type) {
|
8746
|
+
if (type == GGML_TYPE_IQ2_XXS) {
|
8747
|
+
q2xs_deinit_impl(256);
|
8748
|
+
}
|
8749
|
+
else if (type == GGML_TYPE_IQ2_XS) {
|
8750
|
+
q2xs_deinit_impl(512);
|
8751
|
+
}
|
8752
|
+
else {
|
8753
|
+
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
8754
|
+
}
|
8755
|
+
}
|
8756
|
+
|
8757
|
+
// Among the candidate grid points listed in a neighbour record, pick the one
// minimising sum_i weight[i] * (scale*g[i] - xval[i])^2 over its 8 coordinates.
//
// neighbours[0] holds the candidate count and neighbours[1..count] the grid
// indices. On return L[0..7] holds (g[i] - 1)/2 for the chosen point, and the
// chosen grid index is returned. The first candidate wins ties.
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);

    float best_d2    = FLT_MAX;
    int   grid_index = -1;

    const uint16_t * candidate = neighbours + 1;
    const uint16_t * const end = candidate + num_neighbors;
    for (; candidate != end; ++candidate) {
        const int8_t * point = (const int8_t *)(grid + *candidate);
        float d2 = 0;
        for (int i = 0; i < 8; ++i) {
            float q = point[i];
            float diff = scale*q - xval[i];
            d2 += weight[i]*diff*diff;
        }
        if (d2 < best_d2) {
            best_d2    = d2;
            grid_index = *candidate;
        }
    }
    GGML_ASSERT(grid_index >= 0);

    // Convert the winner's coordinates (2*l + 1) back to levels l.
    const int8_t * winner = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) {
        L[i] = (winner[i] - 1)/2;
    }
    return grid_index;
}
|
8780
|
+
|
8781
|
+
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
8782
|
+
|
8783
|
+
const int gindex = iq2_data_index(256);
|
8784
|
+
|
8785
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
8786
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
8787
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
8788
|
+
|
8789
|
+
GGML_ASSERT(quant_weights);
|
8790
|
+
GGML_ASSERT(kgrid_q2xs);
|
8791
|
+
GGML_ASSERT(kmap_q2xs);
|
8792
|
+
GGML_ASSERT(kneighbors_q2xs);
|
8793
|
+
GGML_ASSERT(n%QK_K == 0);
|
8794
|
+
|
8795
|
+
const int kMaxQ = 3;
|
8796
|
+
|
8797
|
+
const int nbl = n/256;
|
8798
|
+
|
8799
|
+
block_iq2_xxs * y = vy;
|
8800
|
+
|
8801
|
+
float scales[QK_K/32];
|
8802
|
+
float weight[32];
|
8803
|
+
float xval[32];
|
8804
|
+
int8_t L[32];
|
8805
|
+
int8_t Laux[32];
|
8806
|
+
float waux[32];
|
8807
|
+
bool is_on_grid[4];
|
8808
|
+
bool is_on_grid_aux[4];
|
8809
|
+
uint8_t block_signs[4];
|
8810
|
+
uint32_t q2[2*(QK_K/32)];
|
8811
|
+
|
8812
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
8813
|
+
|
8814
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
8815
|
+
memset(q2, 0, QK_K/4);
|
8816
|
+
|
8817
|
+
float max_scale = 0;
|
8818
|
+
|
8819
|
+
const float * xbl = x + QK_K*ibl;
|
8820
|
+
float sumx2 = 0;
|
8821
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
8822
|
+
float sigma2 = sumx2/QK_K;
|
8823
|
+
|
8824
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
8825
|
+
const float * xb = xbl + 32*ib;
|
8826
|
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
8827
|
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
8828
|
+
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
8829
|
+
for (int k = 0; k < 4; ++k) {
|
8830
|
+
int nflip = 0;
|
8831
|
+
uint8_t s = 0;
|
8832
|
+
for (int i = 0; i < 8; ++i) {
|
8833
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
8834
|
+
else {
|
8835
|
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
8836
|
+
}
|
8837
|
+
}
|
8838
|
+
if (nflip%2) {
|
8839
|
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
8840
|
+
for (int i = 1; i < 8; ++i) {
|
8841
|
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
8842
|
+
if (ax < min) {
|
8843
|
+
min = ax; imin = i;
|
8844
|
+
}
|
8845
|
+
}
|
8846
|
+
xval[8*k+imin] = -xval[8*k+imin];
|
8847
|
+
s ^= (1 << imin);
|
8848
|
+
}
|
8849
|
+
block_signs[k] = s & 127;
|
8850
|
+
}
|
8851
|
+
float max = xval[0];
|
8852
|
+
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
8853
|
+
if (!max) {
|
8854
|
+
scales[ib] = 0;
|
8855
|
+
memset(L, 0, 32);
|
8856
|
+
continue;
|
8857
|
+
}
|
8858
|
+
float best = 0;
|
8859
|
+
float scale = max/(2*kMaxQ-1);
|
8860
|
+
for (int is = -9; is <= 9; ++is) {
|
8861
|
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
8862
|
+
float this_scale = 1/id;
|
8863
|
+
for (int k = 0; k < 4; ++k) {
|
8864
|
+
for (int i = 0; i < 8; ++i) {
|
8865
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
8866
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
8867
|
+
}
|
8868
|
+
uint16_t u = 0;
|
8869
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
8870
|
+
int grid_index = kmap_q2xs[u];
|
8871
|
+
is_on_grid_aux[k] = true;
|
8872
|
+
if (grid_index < 0) {
|
8873
|
+
is_on_grid_aux[k] = false;
|
8874
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
8875
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
8876
|
+
}
|
8877
|
+
}
|
8878
|
+
float sumqx = 0, sumq2 = 0;
|
8879
|
+
for (int i = 0; i < 32; ++i) {
|
8880
|
+
float w = weight[i];
|
8881
|
+
float q = 2*Laux[i] + 1;
|
8882
|
+
sumqx += w*xval[i]*q;
|
8883
|
+
sumq2 += w*q*q;
|
8884
|
+
}
|
8885
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
8886
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
8887
|
+
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
|
8888
|
+
for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
8889
|
+
}
|
8890
|
+
}
|
8891
|
+
int n_not_ongrid = 0;
|
8892
|
+
for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
8893
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
8894
|
+
float id = 1/scale;
|
8895
|
+
for (int k = 0; k < 4; ++k) {
|
8896
|
+
if (is_on_grid[k]) continue;
|
8897
|
+
uint16_t u = 0;
|
8898
|
+
for (int i = 0; i < 8; ++i) {
|
8899
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
8900
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
8901
|
+
u |= (l << 2*i);
|
8902
|
+
}
|
8903
|
+
int grid_index = kmap_q2xs[u];
|
8904
|
+
if (grid_index < 0) {
|
8905
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
8906
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
8907
|
+
}
|
8908
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
|
8909
|
+
for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
|
8910
|
+
}
|
8911
|
+
float sumqx = 0, sumq2 = 0;
|
8912
|
+
for (int i = 0; i < 32; ++i) {
|
8913
|
+
float w = weight[i];
|
8914
|
+
float q = 2*L[i] + 1;
|
8915
|
+
sumqx += w*xval[i]*q;
|
8916
|
+
sumq2 += w*q*q;
|
8917
|
+
}
|
8918
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
8919
|
+
}
|
8920
|
+
if (scale < 0) {
|
8921
|
+
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
8922
|
+
// and correspondingly flip quant signs.
|
8923
|
+
scale = -scale;
|
8924
|
+
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
8925
|
+
}
|
8926
|
+
for (int k = 0; k < 4; ++k) {
|
8927
|
+
uint16_t u = 0;
|
8928
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
8929
|
+
int grid_index = kmap_q2xs[u];
|
8930
|
+
if (grid_index < 0) {
|
8931
|
+
printf("Oops: found point %u not on grid:", u);
|
8932
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
8933
|
+
printf("\n");
|
8934
|
+
GGML_ASSERT(false);
|
8935
|
+
}
|
8936
|
+
q2[2*ib+0] |= (grid_index << 8*k);
|
8937
|
+
q2[2*ib+1] |= (block_signs[k] << 7*k);
|
8938
|
+
}
|
8939
|
+
GGML_ASSERT(scale >= 0);
|
8940
|
+
scales[ib] = scale;
|
8941
|
+
max_scale = MAX(max_scale, scale);
|
8942
|
+
}
|
8943
|
+
|
8944
|
+
if (!max_scale) {
|
8945
|
+
memset(y[ibl].qs, 0, QK_K/4);
|
8946
|
+
continue;
|
8947
|
+
}
|
8948
|
+
|
8949
|
+
float d = max_scale/31;
|
8950
|
+
y[ibl].d = GGML_FP32_TO_FP16(d);
|
8951
|
+
float id = 1/d;
|
8952
|
+
float sumqx = 0, sumq2 = 0;
|
8953
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
8954
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
8955
|
+
l = MAX(0, MIN(15, l));
|
8956
|
+
q2[2*ib+1] |= ((uint32_t)l << 28);
|
8957
|
+
const float * xb = xbl + 32*ib;
|
8958
|
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
8959
|
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
8960
|
+
const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
|
8961
|
+
const float db = d * (1 + 2*l);
|
8962
|
+
uint32_t u = 0;
|
8963
|
+
for (int k = 0; k < 4; ++k) {
|
8964
|
+
const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
|
8965
|
+
const float * xk = xb + 8*k;
|
8966
|
+
const float * wk = weight + 8*k;
|
8967
|
+
const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
|
8968
|
+
float best_mse = 0; int best_index = aux8[k];
|
8969
|
+
for (int j = 0; j < 8; ++j) {
|
8970
|
+
float diff = db * grid[j] * signs[j] - xk[j];
|
8971
|
+
best_mse += wk[j] * diff * diff;
|
8972
|
+
}
|
8973
|
+
for (int idx = 0; idx < 256; ++idx) {
|
8974
|
+
grid = (const uint8_t *)(kgrid_q2xs + idx);
|
8975
|
+
float mse = 0;
|
8976
|
+
for (int j = 0; j < 8; ++j) {
|
8977
|
+
float diff = db * grid[j] * signs[j] - xk[j];
|
8978
|
+
mse += wk[j] * diff * diff;
|
8979
|
+
}
|
8980
|
+
if (mse < best_mse) {
|
8981
|
+
best_mse = mse; best_index = idx;
|
8982
|
+
}
|
8983
|
+
}
|
8984
|
+
u |= (best_index << 8*k);
|
8985
|
+
grid = (const uint8_t *)(kgrid_q2xs + best_index);
|
8986
|
+
//grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
|
8987
|
+
for (int j = 0; j < 8; ++j) {
|
8988
|
+
float q = db * grid[j] * signs[j];
|
8989
|
+
sumqx += wk[j] * q * xk[j];
|
8990
|
+
sumq2 += wk[j] * q * q;
|
8991
|
+
}
|
8992
|
+
}
|
8993
|
+
q2[2*ib] = u;
|
8994
|
+
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
8995
|
+
}
|
8996
|
+
memcpy(y[ibl].qs, q2, QK_K/4);
|
8997
|
+
}
|
8998
|
+
}
|
8999
|
+
|
9000
|
+
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
9001
|
+
|
9002
|
+
const int gindex = iq2_data_index(512);
|
9003
|
+
|
9004
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
9005
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
9006
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
9007
|
+
|
9008
|
+
GGML_ASSERT(quant_weights);
|
9009
|
+
GGML_ASSERT(kmap_q2xs);
|
9010
|
+
GGML_ASSERT(kgrid_q2xs);
|
9011
|
+
GGML_ASSERT(kneighbors_q2xs);
|
9012
|
+
GGML_ASSERT(n%QK_K == 0);
|
9013
|
+
|
9014
|
+
const int kMaxQ = 3;
|
9015
|
+
|
9016
|
+
const int nbl = n/256;
|
9017
|
+
|
9018
|
+
block_iq2_xs * y = vy;
|
9019
|
+
|
9020
|
+
float scales[QK_K/16];
|
9021
|
+
float weight[16];
|
9022
|
+
float xval[16];
|
9023
|
+
int8_t L[16];
|
9024
|
+
int8_t Laux[16];
|
9025
|
+
float waux[16];
|
9026
|
+
bool is_on_grid[2];
|
9027
|
+
bool is_on_grid_aux[2];
|
9028
|
+
uint8_t block_signs[2];
|
9029
|
+
uint16_t q2[2*(QK_K/16)];
|
9030
|
+
|
9031
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
9032
|
+
|
9033
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
9034
|
+
memset(q2, 0, QK_K/4);
|
9035
|
+
memset(y[ibl].scales, 0, QK_K/32);
|
9036
|
+
|
9037
|
+
float max_scale = 0;
|
9038
|
+
|
9039
|
+
const float * xbl = x + QK_K*ibl;
|
9040
|
+
float sumx2 = 0;
|
9041
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
9042
|
+
float sigma2 = sumx2/QK_K;
|
9043
|
+
|
9044
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
9045
|
+
const float * xb = xbl + 16*ib;
|
9046
|
+
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
9047
|
+
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
9048
|
+
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
9049
|
+
for (int k = 0; k < 2; ++k) {
|
9050
|
+
int nflip = 0;
|
9051
|
+
uint8_t s = 0;
|
9052
|
+
for (int i = 0; i < 8; ++i) {
|
9053
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
9054
|
+
else {
|
9055
|
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
9056
|
+
}
|
9057
|
+
}
|
9058
|
+
if (nflip%2) {
|
9059
|
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
9060
|
+
for (int i = 1; i < 8; ++i) {
|
9061
|
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
9062
|
+
if (ax < min) {
|
9063
|
+
min = ax; imin = i;
|
9064
|
+
}
|
9065
|
+
}
|
9066
|
+
xval[8*k+imin] = -xval[8*k+imin];
|
9067
|
+
s ^= (1 << imin);
|
9068
|
+
}
|
9069
|
+
block_signs[k] = s & 127;
|
9070
|
+
}
|
9071
|
+
float max = xval[0];
|
9072
|
+
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
9073
|
+
if (!max) {
|
9074
|
+
scales[ib] = 0;
|
9075
|
+
memset(L, 0, 16);
|
9076
|
+
continue;
|
9077
|
+
}
|
9078
|
+
float best = 0;
|
9079
|
+
float scale = max/(2*kMaxQ-1);
|
9080
|
+
is_on_grid[0] = is_on_grid[1] = true;
|
9081
|
+
for (int is = -9; is <= 9; ++is) {
|
9082
|
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
9083
|
+
float this_scale = 1/id;
|
9084
|
+
for (int k = 0; k < 2; ++k) {
|
9085
|
+
for (int i = 0; i < 8; ++i) {
|
9086
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
9087
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
9088
|
+
}
|
9089
|
+
uint16_t u = 0;
|
9090
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
9091
|
+
int grid_index = kmap_q2xs[u];
|
9092
|
+
is_on_grid_aux[k] = true;
|
9093
|
+
if (grid_index < 0) {
|
9094
|
+
is_on_grid_aux[k] = false;
|
9095
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
9096
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
9097
|
+
}
|
9098
|
+
}
|
9099
|
+
float sumqx = 0, sumq2 = 0;
|
9100
|
+
for (int i = 0; i < 16; ++i) {
|
9101
|
+
float w = weight[i];
|
9102
|
+
float q = 2*Laux[i] + 1;
|
9103
|
+
sumqx += w*xval[i]*q;
|
9104
|
+
sumq2 += w*q*q;
|
9105
|
+
}
|
9106
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
9107
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
9108
|
+
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
9109
|
+
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
9110
|
+
}
|
9111
|
+
}
|
9112
|
+
int n_not_ongrid = 0;
|
9113
|
+
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
9114
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
9115
|
+
float id = 1/scale;
|
9116
|
+
for (int k = 0; k < 2; ++k) {
|
9117
|
+
if (is_on_grid[k]) continue;
|
9118
|
+
uint16_t u = 0;
|
9119
|
+
for (int i = 0; i < 8; ++i) {
|
9120
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
9121
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
9122
|
+
u |= (l << 2*i);
|
9123
|
+
L[8*k + i] = l;
|
9124
|
+
}
|
9125
|
+
int grid_index = kmap_q2xs[u];
|
9126
|
+
if (grid_index < 0) {
|
9127
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
9128
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
9129
|
+
}
|
9130
|
+
}
|
9131
|
+
float sumqx = 0, sumq2 = 0;
|
9132
|
+
for (int i = 0; i < 16; ++i) {
|
9133
|
+
float w = weight[i];
|
9134
|
+
float q = 2*L[i] + 1;
|
9135
|
+
sumqx += w*xval[i]*q;
|
9136
|
+
sumq2 += w*q*q;
|
9137
|
+
}
|
9138
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
9139
|
+
}
|
9140
|
+
if (scale < 0) {
|
9141
|
+
scale = -scale;
|
9142
|
+
for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
9143
|
+
}
|
9144
|
+
for (int k = 0; k < 2; ++k) {
|
9145
|
+
uint16_t u = 0;
|
9146
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
9147
|
+
int grid_index = kmap_q2xs[u];
|
9148
|
+
if (grid_index < 0) {
|
9149
|
+
printf("Oops: found point %u not on grid:", u);
|
9150
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
9151
|
+
printf("\n");
|
9152
|
+
GGML_ASSERT(false);
|
9153
|
+
}
|
9154
|
+
q2[2*ib+k] = grid_index | (block_signs[k] << 9);
|
9155
|
+
}
|
9156
|
+
GGML_ASSERT(scale >= 0);
|
9157
|
+
scales[ib] = scale;
|
9158
|
+
max_scale = MAX(max_scale, scale);
|
9159
|
+
}
|
9160
|
+
|
9161
|
+
if (!max_scale) {
|
9162
|
+
memset(y[ibl].qs, 0, QK_K/4);
|
9163
|
+
continue;
|
9164
|
+
}
|
9165
|
+
|
9166
|
+
float d = max_scale/31;
|
9167
|
+
y[ibl].d = GGML_FP32_TO_FP16(d);
|
9168
|
+
float id = 1/d;
|
9169
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
9170
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
9171
|
+
l = MAX(0, MIN(15, l));
|
9172
|
+
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
9173
|
+
else y[ibl].scales[ib/2] |= (l << 4);
|
9174
|
+
}
|
9175
|
+
memcpy(y[ibl].qs, q2, QK_K/4);
|
9176
|
+
|
9177
|
+
}
|
9178
|
+
}
|
9179
|
+
|
9180
|
+
size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
9181
|
+
(void)hist;
|
9182
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
9183
|
+
int nblock = n_per_row/QK_K;
|
9184
|
+
char * qrow = (char *)dst;
|
9185
|
+
for (int row = 0; row < nrow; ++row) {
|
9186
|
+
quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
|
9187
|
+
src += n_per_row;
|
9188
|
+
qrow += nblock*sizeof(block_iq2_xxs);
|
9189
|
+
}
|
9190
|
+
return nrow * nblock * sizeof(block_iq2_xxs);
|
9191
|
+
}
|
9192
|
+
|
9193
|
+
size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
9194
|
+
(void)hist;
|
9195
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
9196
|
+
int nblock = n_per_row/QK_K;
|
9197
|
+
char * qrow = (char *)dst;
|
9198
|
+
for (int row = 0; row < nrow; ++row) {
|
9199
|
+
quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
|
9200
|
+
src += n_per_row;
|
9201
|
+
qrow += nblock*sizeof(block_iq2_xs);
|
9202
|
+
}
|
9203
|
+
return nrow * nblock * sizeof(block_iq2_xs);
|
9204
|
+
}
|
9205
|
+
|