llama_cpp 0.14.2 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
The hunks below are from data/vendor/tmp/llama.cpp/ggml-quants.c.

```diff
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if __AVXVNNI__
+#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
```
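The only functional change in this first hunk is the guard: a bare `#if __AVXVNNI__` evaluates an undefined macro as 0 (and trips `-Wundef`), and it also skipped builds that only define `__AVX512VNNI__`; the new test admits either feature. For readers without the intrinsics reference at hand, here is a scalar model of what `_mm256_dpbusd_epi32` computes in each of its eight 32-bit lanes (the helper name is ours, a sketch rather than ggml code):

```c
#include <stdint.h>

// One 32-bit lane of _mm256_dpbusd_epi32: accumulate four products of
// an unsigned byte from a and a signed byte from b onto acc.
static int32_t dpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
    for (int k = 0; k < 4; ++k) {
        acc += (int32_t)a[k] * (int32_t)b[k];
    }
    return acc;
}
```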
```diff
@@ -3474,6 +3474,65 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
     }
 }
 
+void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    float delta[4];
+    uint16_t idx[4];
+
+#if QK_K != 64
+    iq1m_scale_t scale;
+#endif
+
+    for (int i = 0; i < nb; i++) {
+
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+#if QK_K == 64
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+#else
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+        const float d = GGML_FP16_TO_FP32(scale.f16);
+#endif
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+#if QK_K == 64
+            const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1);
+            const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1);
+#else
+            const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
+            const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
+#endif
+            idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
+            idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
+            idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
+            idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
+            delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+            delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+            delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+            delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+            for (int l = 0; l < 2; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl1 * (grid[j] + delta[l]);
+                }
+                y += 8;
+            }
+            for (int l = 2; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl2 * (grid[j] + delta[l]);
+                }
+                y += 8;
+            }
+            qs += 4;
+            qh += 2;
+        }
+    }
+}
+
 static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
 void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
```
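Every 32-weight IQ1_M block addresses four entries of the shared `iq1s_grid` codebook with 11-bit indices: the low 8 bits come from one `qs` byte and the high 3 bits from a nibble of `qh`, as the `idx[0..3]` lines above show. A hedged sketch of that reassembly (the helper name is ours, not ggml's):

```c
#include <stdint.h>

// Rebuild the 11-bit iq1s_grid index for group l (0..3) of a 32-weight
// block: qs[l] supplies bits 0-7; a nibble of qh[l/2] supplies bits 8-10
// (low nibble for even l, high nibble for odd l).
static uint16_t iq1m_grid_index(const uint8_t * qs, const uint8_t * qh, int l) {
    const int shift = (l % 2) ? 4 : 8;
    return (uint16_t)(qs[l] | (((uint16_t)qh[l/2] << shift) & 0x700));
}
```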
```diff
@@ -9695,6 +9754,248 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 #endif
 }
 
+void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_m * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if QK_K != 64
+    iq1m_scale_t scale;
+#endif
+
+#if defined __ARM_NEON
+
+#if QK_K == 64
+    const int32x4_t mask = vdupq_n_s32(0xf);
+#else
+    const int32x4_t mask = vdupq_n_s32(0x7);
+#endif
+    const int32x4_t mone  = vdupq_n_s32(1);
+    const int32x4_t mzero = vdupq_n_s32(0);
+
+    ggml_int8x16x4_t deltas;
+    deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
+    deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
+    deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
+    deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
+
+    ggml_int8x16x4_t q1b;
+    ggml_int8x16x4_t q8b;
+
+    uint32_t aux32;
+    const uint8_t * aux8 = (const uint8_t *)&aux32;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+#if QK_K != 64
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+#endif
+
+        int32x4_t sumi1 = mzero;
+        int32x4_t sumi2 = mzero;
+
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+
+            q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
+            q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
+            q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
+            q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
+
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
+            const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
+            const int32x4_t p12 = vpaddq_s32(p1, p2);
+
+            const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
+            aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
+
+            const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
+            const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
+            const int32x4_t p34 = vpaddq_s32(p3, p4);
+
+#if QK_K == 64
+            int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12);
+#else
+            int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
+#endif
+            scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
+
+            sumi1 = vmlaq_s32(sumi1, scales_4, p12);
+            sumi2 = vmlaq_s32(sumi2, scales_4, p34);
+
+            qs += 8; qh += 4;
+
+        }
+
+#if QK_K == 64
+        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+#else
+        sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+#endif
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+#if QK_K == 64
+    const __m256i mask = _mm256_set1_epi16(0xf);
+#else
+    const __m256i mask = _mm256_set1_epi16(0x7);
+#endif
+    const __m256i mone = _mm256_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+#if QK_K != 64
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+#endif
+
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m256i q1b_1 = _mm256_set_epi64x(
+                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
+                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
+            );
+            const __m256i q1b_2 = _mm256_set_epi64x(
+                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
+                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
+            );
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+
+            const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+            const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
+            const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+#if QK_K == 64
+            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 4), _mm_set1_epi16(sc[0] >> 0));
+            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8));
+#else
+            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
+            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+#endif
+            scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
+            scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+            const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
+            const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
+            const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
+            const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
+
+            sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
+            sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
+
+            qs += 8; qh += 4;
+        }
+
+#if QK_K == 64
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+#else
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+#endif
+        accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
+        accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
+
+    }
+
+    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+#else
+
+    int sum1[2], sum2[2], delta[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+#if QK_K != 64
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+#endif
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            delta[0] = qh[0] & 0x08 ? -1 : 1;
+            delta[1] = qh[0] & 0x80 ? -1 : 1;
+            delta[2] = qh[1] & 0x08 ? -1 : 1;
+            delta[3] = qh[1] & 0x80 ? -1 : 1;
+            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
+                int lsum1 = 0, lsum2 = 0;
+                for (int j = 0; j < 8; ++j) {
+                    lsum1 += q8[j] * grid[j];
+                    lsum2 += q8[j];
+                }
+                q8 += 8;
+                sum1[l/2] += lsum1;
+                sum2[l/2] += lsum2*delta[l];
+            }
+#if QK_K == 64
+            const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1;
+            const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1;
+#else
+            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
+            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
+#endif
+            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
+            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
+            qs += 4;
+            qh += 2;
+        }
+
+#if QK_K == 64
+        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+#else
+        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+#endif
+    }
+
+    *s = sumf;
+
+#endif
+}
+
 void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
```
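All three dot-product paths above, like the dequantizer, rebuild the superblock scale with the same expression: for QK_K == 256, IQ1_M has no standalone `d` field, so the 16 bits of the fp16 scale are scattered 4 at a time into the top nibble of each of the four 16-bit words of `x[i].scales` (the `iq1m_scale_t` fp16/u16 union lives in ggml-common.h, also touched in this release). A round-trip sketch over the raw 16 bits (helper names ours):

```c
#include <stdint.h>

// Scatter the 16 scale bits into the top nibble of four 16-bit words,
// as quantize_row_iq1_m_impl does at the end of each superblock...
static void iq1m_pack_scale(uint16_t sc[4], uint16_t u16) {
    sc[0] |= (uint16_t)((u16 & 0x000f) << 12);
    sc[1] |= (uint16_t)((u16 & 0x00f0) <<  8);
    sc[2] |= (uint16_t)((u16 & 0x0f00) <<  4);
    sc[3] |= (uint16_t)( u16 & 0xf000);
}

// ...and gather them back (the scale.u16 = ... expression above).
static uint16_t iq1m_unpack_scale(const uint16_t sc[4]) {
    return (uint16_t)((sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) |
                      ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000));
}
```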
```diff
@@ -9938,17 +10239,17 @@ static iq2_entry_t iq2_data[4] = {
 };
 
 static inline int iq2_data_index(enum ggml_type type) {
-    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
     return type == GGML_TYPE_IQ2_XXS ? 0 :
            type == GGML_TYPE_IQ2_XS  ? 1 :
-           type == GGML_TYPE_IQ1_S   ? 2 : 3;
+           type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
 }
 
 static inline int iq2_grid_size(enum ggml_type type) {
-    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
     return type == GGML_TYPE_IQ2_XXS ? 256 :
            type == GGML_TYPE_IQ2_XS  ? 512 :
-           type == GGML_TYPE_IQ1_S   ? NGRID_IQ1S : 1024;
+           type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
 }
 
 static int iq2_compare_func(const void * left, const void * right) {
```
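The net effect of these ternary edits is that `GGML_TYPE_IQ1_M` is accepted everywhere IQ1_S is and aliases the same codebook slot, so `iq2xs_init_impl`/`iq2xs_free_impl` manage one shared grid for both types. The same mapping restated as a switch (sketch only; `enum ggml_type` is ggml.h's):

```c
#include "ggml.h" // for enum ggml_type

// Same mapping as the ternaries above: IQ1_M shares slot 2 with IQ1_S.
static inline int iq2_data_index_sketch(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_IQ2_XXS: return 0;
        case GGML_TYPE_IQ2_XS:  return 1;
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ1_M:   return 2; // shared 1-bit grid (NGRID_IQ1S entries)
        default:                return 3; // GGML_TYPE_IQ2_S
    }
}
```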
```diff
@@ -10214,10 +10515,10 @@ void iq2xs_init_impl(enum ggml_type type) {
 
     const int kmap_size = 43692;
     //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
-    const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
+    const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
     const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
                              type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 :
-                             type == GGML_TYPE_IQ1_S   ? kgrid_1bit_2048 : kgrid_2bit_1024;
+                             type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
     uint64_t * kgrid_q2xs;
     int      * kmap_q2xs;
     uint16_t * kneighbors_q2xs;
```
```diff
@@ -10314,7 +10615,7 @@ void iq2xs_init_impl(enum ggml_type type) {
 }
 
 void iq2xs_free_impl(enum ggml_type type) {
-    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
     const int gindex = iq2_data_index(type);
     if (iq2_data[gindex].grid) {
         free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
```
```diff
@@ -11520,7 +11821,16 @@ static int iq1_sort_helper(const void * left, const void * right) {
 }
 
 #define IQ1S_BLOCK_SIZE 32
-static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+#define IQ1M_BLOCK_SIZE 16
+static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+        float    * scales,
+        float    * weight,
+        float    * sumx,
+        float    * sumw,
+        float    * pairs,
+        int8_t   * L,
+        uint16_t * index,
+        int8_t   * shifts) {
 
     const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
 
```
```diff
@@ -11534,22 +11844,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(n%QK_K == 0);
 
+    block_iq1_s * y = vy;
+
     const int nbl = n/QK_K;
 
-    block_iq1_s * y = vy;
+    const int block_size = IQ1S_BLOCK_SIZE;
 
     const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
     const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
 
-    float  scales[QK_K/IQ1S_BLOCK_SIZE];
-    float  weight[IQ1S_BLOCK_SIZE];
-    int8_t L[IQ1S_BLOCK_SIZE];
-    float  sumx[IQ1S_BLOCK_SIZE+1];
-    float  sumw[IQ1S_BLOCK_SIZE+1];
-    float  pairs[2*IQ1S_BLOCK_SIZE];
+
     int * idx = (int *)(pairs + 1);
-    uint16_t index[IQ1S_BLOCK_SIZE/8];
-    int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
 
     for (int ibl = 0; ibl < nbl; ++ibl) {
 
```
```diff
@@ -11564,15 +11869,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
         for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
         float sigma2 = 2*sumx2/QK_K;
 
-        for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
-            const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
-            const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
-            for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            const float * xb = xbl + block_size*ib;
+            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
             float max = fabsf(xb[0]);
-            for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
+            for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
             if (!max) {
                 scales[ib] = 0;
-                memset(L, 1, IQ1S_BLOCK_SIZE);
+                memset(L, 1, block_size);
                 continue;
             }
             // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
```
```diff
@@ -11581,14 +11886,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
             // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
             // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
             // for each possible and score for each split.
-            for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+            for (int j = 0; j < block_size; ++j) {
                 pairs[2*j] = xb[j];
                 idx[2*j] = j;
             }
-            qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
+            qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
             {
                 sumx[0] = sumw[0] = 0;
-                for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+                for (int j = 0; j < block_size; ++j) {
                     int i = idx[2*j];
                     sumx[j+1] = sumx[j] + weight[i]*xb[i];
                     sumw[j+1] = sumw[j] + weight[i];
```
```diff
@@ -11596,16 +11901,16 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
             }
             float best_score = 0, scale = max;
             int besti1 = -1, besti2 = -1, best_shift = 0;
-            for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
-                for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
-                    float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
-                    float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
+            for (int i1 = 0; i1 <= block_size; ++i1) {
+                for (int i2 = i1; i2 <= block_size; ++i2) {
+                    float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
+                    float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
                     if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
                         scale = sumqx/sumq2; best_score = scale*sumqx;
                         besti1 = i1; besti2 = i2; best_shift = 1;
                     }
-                    sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
-                    sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
+                    sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
+                    sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
                     if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
                         scale = sumqx/sumq2; best_score = scale*sumqx;
                         besti1 = i1; besti2 = i2; best_shift = -1;
```
```diff
@@ -11615,14 +11920,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
             GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
             for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
             for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
-            for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
+            for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
             if (scale < 0) {
-                for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
+                for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
                 scale = -scale; best_shift = -best_shift;
             }
             bool all_on_grid = true;
             const float * xx = best_shift == 1 ? x_p : x_m;
-            for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
+            for (int k = 0; k < block_size/8; ++k) {
                 uint16_t u = 0;
                 for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
                 int grid_index = kmap_q2xs[u];
```
```diff
@@ -11636,7 +11941,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
             }
             if (!all_on_grid) {
                 float sumqx = 0, sumq2 = 0;
-                for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
+                for (int k = 0; k < block_size/8; ++k) {
                     const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
                     for (int j = 0; j < 8; ++j) {
                         float w = weight[8*k + j];
```
|
|
11648
11953
|
if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
|
11649
11954
|
}
|
11650
11955
|
uint16_t h = 0;
|
11651
|
-
for (int k = 0; k <
|
11652
|
-
y[ibl].qs[(
|
11956
|
+
for (int k = 0; k < block_size/8; ++k) {
|
11957
|
+
y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
|
11653
11958
|
h |= (index[k] >> 8) << 3*k;
|
11654
11959
|
}
|
11655
11960
|
y[ibl].qh[ib] = h;
|
```diff
@@ -11660,14 +11965,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
         }
 
         if (!max_scale) {
-            memset(y[ibl].qs, 0, QK_K/8);
             continue;
         }
 
         float d = max_scale/15;
-        y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.
+        y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
         float id = 1/d;
-        for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
             int l = nearest_int(0.5f*(id*scales[ib]-1));
             l = MAX(0, MIN(7, l));
             if (shifts[ib] == -1) l |= 8;
```
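Worth spelling out from the loop above: each IQ1_S block scale is stored as an odd multiplier, dequantized back as `2*l + 1` with `l` clamped to 3 bits, and the sign of the block's shift rides in bit 3 (`l |= 8`). A sketch of just that encoding step (helper name ours; ggml's `nearest_int` is approximated with `lroundf`):

```c
#include <math.h>
#include <stdint.h>

// Encode one IQ1_S block scale: 3 bits of magnitude plus the shift sign
// in bit 3, mirroring the nearest_int()/clamp/l |= 8 lines above.
static uint8_t iq1s_encode_scale(float scale, float id, int shift) {
    int l = (int)lroundf(0.5f*(id*scale - 1));
    l = l < 0 ? 0 : l > 7 ? 7 : l;
    if (shift == -1) l |= 8;
    return (uint8_t)l;
}
```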
```diff
@@ -11678,16 +11982,307 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
 
 size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
+    float  scales[QK_K/IQ1S_BLOCK_SIZE];
+    float  weight[IQ1S_BLOCK_SIZE];
+    int8_t L[IQ1S_BLOCK_SIZE];
+    float  sumx[IQ1S_BLOCK_SIZE+1];
+    float  sumw[IQ1S_BLOCK_SIZE+1];
+    float  pairs[2*IQ1S_BLOCK_SIZE];
+    uint16_t index[IQ1S_BLOCK_SIZE/8];
+    int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
     int nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
-        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
+        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
         src += n_per_row;
         qrow += nblock*sizeof(block_iq1_s);
     }
     return nrow * nblock * sizeof(block_iq1_s);
 }
 
+static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+        float    * scales,
+        float    * weight,
+        float    * pairs,
+        int8_t   * L,
+        uint16_t * index,
+        int8_t   * shifts) {
+
+    const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
+
+    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+    const int      * kmap_q2xs       = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    //GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    block_iq1_m * y = vy;
+
+    const int nbl = n/QK_K;
+
+    const int block_size = IQ1M_BLOCK_SIZE;
+
+    const float x_p[3] = {-1 + IQ1M_DELTA,  IQ1M_DELTA, 1 + IQ1M_DELTA};
+    const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
+    const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
+
+    int * idx = (int *)(pairs + 1);
+
+    float sumqx[4], sumq2[4];
+
+    iq1m_scale_t s;
+    const float * xx;
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+#if QK_K == 64
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+#endif
+        memset(y[ibl].qs, 0, QK_K/8);
+        memset(y[ibl].qh, 0, QK_K/16);
+        memset(y[ibl].scales, 0, QK_K/32);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = 2*sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            const float * xb = xbl + block_size*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
+            float max = fabsf(xb[0]);
+            for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
+            if (!max) {
+                scales[ib] = 0;
+                memset(L, 1, block_size);
+                continue;
+            }
+            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+            // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+            // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+            // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+            // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
+            // for each possible and score for each split.
+            for (int j = 0; j < block_size; ++j) {
+                pairs[2*j] = xb[j];
+                idx[2*j] = j;
+            }
+            qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+            float best_score = 0, scale = max;
+            int besti1 = -1, besti2 = -1, best_k = -1;
+            // 0: +, +
+            // 1: +, -
+            // 2: -, +
+            // 3: -, -
+            for (int i1 = 0; i1 <= block_size; ++i1) {
+                for (int i2 = i1; i2 <= block_size; ++i2) {
+                    memset(sumqx, 0, 4*sizeof(float));
+                    memset(sumq2, 0, 4*sizeof(float));
+                    for (int j = 0; j < i1; ++j) {
+                        int i = idx[2*j];
+                        if (i < block_size/2) {
+                            sumqx[0] += weight[i]*x_p[0]*xb[i];
+                            sumqx[1] += weight[i]*x_p[0]*xb[i];
+                            sumqx[2] += weight[i]*x_m[0]*xb[i];
+                            sumqx[3] += weight[i]*x_m[0]*xb[i];
+                            sumq2[0] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[1] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[2] += weight[i]*x_m[0]*x_m[0];
+                            sumq2[3] += weight[i]*x_m[0]*x_m[0];
+                        } else {
+                            sumqx[0] += weight[i]*x_p[0]*xb[i];
+                            sumqx[2] += weight[i]*x_p[0]*xb[i];
+                            sumqx[1] += weight[i]*x_m[0]*xb[i];
+                            sumqx[3] += weight[i]*x_m[0]*xb[i];
+                            sumq2[0] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[2] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[1] += weight[i]*x_m[0]*x_m[0];
+                            sumq2[3] += weight[i]*x_m[0]*x_m[0];
+                        }
+                    }
+                    for (int j = i1; j < i2; ++j) {
+                        int i = idx[2*j];
+                        if (i < block_size/2) {
+                            sumqx[0] += weight[i]*x_p[1]*xb[i];
+                            sumqx[1] += weight[i]*x_p[1]*xb[i];
+                            sumqx[2] += weight[i]*x_m[1]*xb[i];
+                            sumqx[3] += weight[i]*x_m[1]*xb[i];
+                            sumq2[0] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[1] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[2] += weight[i]*x_m[1]*x_m[1];
+                            sumq2[3] += weight[i]*x_m[1]*x_m[1];
+                        } else {
+                            sumqx[0] += weight[i]*x_p[1]*xb[i];
+                            sumqx[2] += weight[i]*x_p[1]*xb[i];
+                            sumqx[1] += weight[i]*x_m[1]*xb[i];
+                            sumqx[3] += weight[i]*x_m[1]*xb[i];
+                            sumq2[0] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[2] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[1] += weight[i]*x_m[1]*x_m[1];
+                            sumq2[3] += weight[i]*x_m[1]*x_m[1];
+                        }
+                    }
+                    for (int j = i2; j < block_size; ++j) {
+                        int i = idx[2*j];
+                        if (i < block_size/2) {
+                            sumqx[0] += weight[i]*x_p[2]*xb[i];
+                            sumqx[1] += weight[i]*x_p[2]*xb[i];
+                            sumqx[2] += weight[i]*x_m[2]*xb[i];
+                            sumqx[3] += weight[i]*x_m[2]*xb[i];
+                            sumq2[0] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[1] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[2] += weight[i]*x_m[2]*x_m[2];
+                            sumq2[3] += weight[i]*x_m[2]*x_m[2];
+                        } else {
+                            sumqx[0] += weight[i]*x_p[2]*xb[i];
+                            sumqx[2] += weight[i]*x_p[2]*xb[i];
+                            sumqx[1] += weight[i]*x_m[2]*xb[i];
+                            sumqx[3] += weight[i]*x_m[2]*xb[i];
+                            sumq2[0] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[2] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[1] += weight[i]*x_m[2]*x_m[2];
+                            sumq2[3] += weight[i]*x_m[2]*x_m[2];
+                        }
+                    }
+                    for (int k = 0; k < 4; ++k) {
+                        if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
+                            scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
+                            besti1 = i1; besti2 = i2; best_k = k;
+                        }
+                    }
+                }
+            }
+            GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
+            for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+            for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+            for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+            if (scale < 0) {
+                for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+                scale = -scale;
+                best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
+            }
+            bool all_on_grid = true;
+            for (int k = 0; k < block_size/8; ++k) {
+                if (k == 0) xx = best_k < 2 ? x_p : x_m;
+                else xx = best_k%2 == 0 ? x_p : x_m;
+                uint16_t u = 0;
+                for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+                int grid_index = kmap_q2xs[u];
+                if (grid_index < 0) {
+                    all_on_grid = false;
+                    const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                    grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+                    GGML_ASSERT(grid_index >= 0);
+                }
+                index[k] = grid_index;
+            }
+            if (!all_on_grid) {
+                float sumqx_f = 0, sumq2_f = 0;
+                for (int k = 0; k < block_size/8; ++k) {
+                    if (k == 0) xx = best_k < 2 ? x_p : x_m;
+                    else xx = best_k%2 == 0 ? x_p : x_m;
+                    const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
+                    for (int j = 0; j < 8; ++j) {
+                        float w = weight[8*k + j];
+                        float q = xx[(pg[j] - 1)/2];
+                        sumqx_f += w*q*xb[8*k+j];
+                        sumq2_f += w*q*q;
+                    }
+                }
+                if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
+            }
+            y[ibl].qs[2*ib + 0] = index[0] & 255;
+            y[ibl].qs[2*ib + 1] = index[1] & 255;
+            y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            shifts[ib] = best_k;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            continue;
+        }
+
+        uint16_t * sc = (uint16_t *)y[ibl].scales;
+#if QK_K == 64
+        float d = max_scale/31;
+#else
+        float d = max_scale/15;
+#endif
+        float id = 1/d;
+        float sumqx_f = 0, sumq2_f = 0;
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib+0]-1));
+#if QK_K == 64
+            l = MAX(0, MIN(15, l));
+            sc[ib/4] |= (l << 4*(ib%4));
+#else
+            l = MAX(0, MIN(7, l));
+            sc[ib/4] |= (l << 3*(ib%4));
+#endif
+            y[ibl].qh[ib] |= masks[shifts[ib]];
+            const float * xb = xbl + block_size*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
+            for (int k = 0; k < block_size/8; ++k) {
+                if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
+                else xx = shifts[ib]%2 == 0 ? x_p : x_m;
+                const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
+                for (int j = 0; j < 8; ++j) {
+                    float w = weight[8*k + j];
+                    float q = xx[(pg[j] - 1)/2]*(2*l+1);
+                    sumqx_f += w*q*xb[8*k+j];
+                    sumq2_f += w*q*q;
+                }
+            }
+        }
+        if (sumq2_f > 0) d = sumqx_f/sumq2_f;
+        s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
+#if QK_K == 64
+        y[ibl].d = s.f16;
+#else
+        sc[0] |= ((s.u16 & 0x000f) << 12);
+        sc[1] |= ((s.u16 & 0x00f0) <<  8);
+        sc[2] |= ((s.u16 & 0x0f00) <<  4);
+        sc[3] |= ((s.u16 & 0xf000) <<  0);
+#endif
+    }
+}
+
+size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    float  scales[QK_K/IQ1M_BLOCK_SIZE];
+    float  weight[IQ1M_BLOCK_SIZE];
+    int8_t L[IQ1M_BLOCK_SIZE];
+    float  pairs[2*IQ1M_BLOCK_SIZE];
+    uint16_t index[IQ1M_BLOCK_SIZE/8];
+    int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
+    int nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq1_m);
+    }
+    return nrow * nblock * sizeof(block_iq1_m);
+}
+
 // ============================ 4-bit non-linear quants
 
 static inline int best_index_int8(int n, const int8_t * val, float x) {
```
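One detail ties this quantizer back to the IQ1_M kernels earlier in the diff: `best_k` encodes the delta sign of each 8-weight half of a 16-weight block, and `masks[4] = {0x00, 0x80, 0x08, 0x88}` plants those signs in bits 3 and 7 of the block's `qh` byte, where the dot products and the dequantizer read them back. A decoding sketch (helper name ours; the `IQ1M_DELTA` value is our assumption, mirroring ggml-common.h):

```c
#include <stdint.h>

#define IQ1M_DELTA 0.125f // assumed value, mirrors ggml-common.h

// Recover the delta that masks[shifts[ib]] stored in a block's qh byte:
// bit 3 covers weights 0-7, bit 7 covers weights 8-15.
static float iq1m_half_delta(uint8_t qh, int half /* 0 or 1 */) {
    const uint8_t bit = half == 0 ? 0x08 : 0x80;
    return (qh & bit) ? -IQ1M_DELTA : IQ1M_DELTA;
}
```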
```diff
@@ -11705,9 +12300,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size,
                                      ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
                                      float * scales, float * weight, uint8_t * L,
                                      const int8_t * values,
-                                     const float * quant_weights) {
-
-    const int ntry = 7;
+                                     const float * quant_weights,
+                                     const int ntry) {
 
     float sigma2 = 0;
     for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
```
```diff
@@ -11719,6 +12313,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size,
     float max_scale = 0, amax_scale = 0;
     for (int ib = 0; ib < super_block_size/block_size; ++ib) {
         const float * xb = x + ib*block_size;
+        uint8_t * Lb = L + ib*block_size;
         if (quant_weights) {
             const float * qw = quant_weights + ib*block_size;
             for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
```
```diff
@@ -11736,12 +12331,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size,
             scales[ib] = 0;
             continue;
         }
-        float d = -max/values[0];
+        float d = ntry > 0 ? -max/values[0] : max/values[0];
         float id = 1/d;
         float sumqx = 0, sumq2 = 0;
         for (int j = 0; j < block_size; ++j) {
             float al = id*xb[j];
             int l = best_index_int8(16, values, al);
+            Lb[j] = l;
             float q = values[l];
             float w = weight[j];
             sumqx += w*q*xb[j];
```
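Before the remaining hunks, note what the new `ntry` argument controls at the start of each block: with `ntry > 0` (the imatrix-driven `quantize_iq4_nl`/`quantize_iq4_xs` paths) the initial scale guess keeps its old `-max/values[0]` form, while the new single-pass row path passes `ntry = -1` and uses `max/values[0]` with no retries. Restated as a sketch (`kvalues` copies `kvalues_iq4nl` from this diff; the helper name is ours):

```c
#include <stdint.h>

static const int8_t kvalues[16] = {-127, -104, -83, -65, -49, -35, -22, -10,
                                      1,   13,  25,  38,  53,  69,  89, 113};

// max: the block's reference sample; kvalues[0] = -127 is the most
// negative table entry, so the two branches differ only in sign.
static float iq4nl_initial_scale(float max, int ntry) {
    return ntry > 0 ? -max/kvalues[0] : max/kvalues[0];
}
```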
```diff
@@ -11796,9 +12392,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size,
         }
     } else {
         dh[0] = GGML_FP32_TO_FP16(scales[0]);
-        float id = scales[0] ? 1/scales[0] : 0;
-        for (int j = 0; j < super_block_size; ++j) {
-            L[j] = best_index_int8(16, values, id*x[j]);
+        if (ntry > 0) {
+            float id = scales[0] ? 1/scales[0] : 0;
+            for (int j = 0; j < super_block_size; ++j) {
+                L[j] = best_index_int8(16, values, id*x[j]);
+            }
         }
     }
 
```
```diff
@@ -11823,7 +12421,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
             quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
-                                     &scale, weight, L, kvalues_iq4nl, qw);
+                                     &scale, weight, L, kvalues_iq4nl, qw, 7);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_nl);
```
```diff
@@ -11832,14 +12430,23 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
 }
 
 void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_NL == 0);
-    block_iq4_nl * restrict y = vy;
-    quantize_row_iq4_nl_reference(x, y, k);
+    GGML_ASSERT(k%QK4_NL == 0);
+    int nblock = k/QK4_NL;
+    uint8_t L[QK4_NL];
+    float weight[QK4_NL];
+    uint16_t unused_h;
+    uint8_t * unused_l = NULL;
+    float scale;
+    block_iq4_nl * iq4 = (block_iq4_nl *)vy;
+    for (int ibl = 0; ibl < nblock; ++ibl) {
+        quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
+                                 &scale, weight, L, kvalues_iq4nl, NULL, -1);
+    }
 }
 
 void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
     assert(k % QK4_NL == 0);
-    quantize_iq4_nl(x, y, 1, k, NULL);
+    quantize_row_iq4_nl(x, y, k);
 }
 
 size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
```
```diff
@@ -11857,7 +12464,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
             quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
-                                     scales, weight, L, kvalues_iq4nl, qw);
+                                     scales, weight, L, kvalues_iq4nl, qw, 7);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_xs);
```