llama_cpp 0.14.2 → 0.14.4

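This release updates the vendored ggml-quants.c from upstream llama.cpp. The changes below add the new 1-bit IQ1_M quantization type (dequantization, quantized dot product against Q8_K, and quantization), widen an AVX-VNNI compile guard, and rework the IQ4_NL quantization entry points so the in-place path no longer depends on importance weights.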
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
  }
 
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
- #if __AVXVNNI__
+ #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
  const __m256i zero = _mm256_setzero_si256();
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
  return _mm256_cvtepi32_ps(summed_pairs);
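The old guard tested __AVXVNNI__ directly, so the fused unsigned-by-signed dot path was skipped on builds that only define __AVX512VNNI__; the new defined(...) form accepts either VNNI flavor. As a minimal scalar sketch (illustration only, not part of the patch), _mm256_dpbusd_epi32 computes per 32-bit lane:

#include <stdint.h>

/* Scalar reference for one 32-bit lane of _mm256_dpbusd_epi32(acc, a, b):
   four unsigned bytes of a times four signed bytes of b, summed into acc.
   Hypothetical helper, for illustration. */
static int32_t dpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
    for (int k = 0; k < 4; ++k) acc += (int32_t)a[k] * (int32_t)b[k];
    return acc;
}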
@@ -3474,6 +3474,65 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
  }
  }
 
+ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
+ assert(k % QK_K == 0);
+ const int nb = k / QK_K;
+
+ float delta[4];
+ uint16_t idx[4];
+
+ #if QK_K != 64
+ iq1m_scale_t scale;
+ #endif
+
+ for (int i = 0; i < nb; i++) {
+
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+ #if QK_K == 64
+ const float d = GGML_FP16_TO_FP32(x[i].d);
+ #else
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ const float d = GGML_FP16_TO_FP32(scale.f16);
+ #endif
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+
+ for (int ib = 0; ib < QK_K/32; ++ib) {
+ #if QK_K == 64
+ const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1);
+ const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1);
+ #else
+ const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
+ const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
+ #endif
+ idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
+ idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
+ idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
+ idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
+ delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+ delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+ delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+ delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+ for (int l = 0; l < 2; ++l) {
+ const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+ for (int j = 0; j < 8; ++j) {
+ y[j] = dl1 * (grid[j] + delta[l]);
+ }
+ y += 8;
+ }
+ for (int l = 2; l < 4; ++l) {
+ const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+ for (int j = 0; j < 8; ++j) {
+ y[j] = dl2 * (grid[j] + delta[l]);
+ }
+ y += 8;
+ }
+ qs += 4;
+ qh += 2;
+ }
+ }
+ }
+
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
  void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
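A note on the new dequantizer: for QK_K != 64 the per-super-block fp16 scale has no field of its own. Its four nibbles are stored in bits 12..15 of the four 16-bit words of x[i].scales (the low 12 bits of each word hold the 3-bit sub-block scales), and the scale.u16 line gathers them back. A round-trip sketch with hypothetical helpers mirroring the packing used later in this diff:

#include <assert.h>
#include <stdint.h>

/* Split a 16-bit value into four nibbles in bits 12..15 of sc[0..3],
   and reassemble it, as the IQ1_M code does. Illustration only. */
static void pack_iq1m_scale(uint16_t v, uint16_t sc[4]) {
    sc[0] |= (uint16_t)((v & 0x000f) << 12);
    sc[1] |= (uint16_t)((v & 0x00f0) <<  8);
    sc[2] |= (uint16_t)((v & 0x0f00) <<  4);
    sc[3] |= (uint16_t)((v & 0xf000) <<  0);
}
static uint16_t unpack_iq1m_scale(const uint16_t sc[4]) {
    return (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
}

int main(void) {
    uint16_t sc[4] = {0, 0, 0, 0};
    pack_iq1m_scale(0xabcd, sc);
    assert(unpack_iq1m_scale(sc) == 0xabcd); /* round trip preserves the value */
    return 0;
}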
@@ -9695,6 +9754,248 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
  #endif
  }
 
+ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_iq1_m * restrict x = vx;
+ const block_q8_K * restrict y = vy;
+
+ const int nb = n / QK_K;
+
+ #if QK_K != 64
+ iq1m_scale_t scale;
+ #endif
+
+ #if defined __ARM_NEON
+
+ #if QK_K == 64
+ const int32x4_t mask = vdupq_n_s32(0xf);
+ #else
+ const int32x4_t mask = vdupq_n_s32(0x7);
+ #endif
+ const int32x4_t mone = vdupq_n_s32(1);
+ const int32x4_t mzero = vdupq_n_s32(0);
+
+ ggml_int8x16x4_t deltas;
+ deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
+ deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
+ deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
+ deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
+
+ ggml_int8x16x4_t q1b;
+ ggml_int8x16x4_t q8b;
+
+ uint32_t aux32;
+ const uint8_t * aux8 = (const uint8_t *)&aux32;
+
+ float sumf = 0;
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ #if QK_K != 64
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ #endif
+
+ int32x4_t sumi1 = mzero;
+ int32x4_t sumi2 = mzero;
+
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+
+ q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
+ q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
+ q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
+ q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
+
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+ const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
+ const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
+ const int32x4_t p12 = vpaddq_s32(p1, p2);
+
+ const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
+ aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
+
+ const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
+ const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
+ const int32x4_t p34 = vpaddq_s32(p3, p4);
+
+ #if QK_K == 64
+ int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12);
+ #else
+ int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
+ #endif
+ scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
+
+ sumi1 = vmlaq_s32(sumi1, scales_4, p12);
+ sumi2 = vmlaq_s32(sumi2, scales_4, p34);
+
+ qs += 8; qh += 4;
+
+ }
+
+ #if QK_K == 64
+ sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+ #else
+ sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+ #endif
+ }
+
+ *s = sumf;
+
+ #elif defined __AVX2__
+
+ #if QK_K == 64
+ const __m256i mask = _mm256_set1_epi16(0xf);
+ #else
+ const __m256i mask = _mm256_set1_epi16(0x7);
+ #endif
+ const __m256i mone = _mm256_set1_epi16(1);
+
+ __m256 accum1 = _mm256_setzero_ps();
+ __m256 accum2 = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ #if QK_K != 64
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ #endif
+
+ __m256i sumi1 = _mm256_setzero_si256();
+ __m256i sumi2 = _mm256_setzero_si256();
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ const __m256i q1b_1 = _mm256_set_epi64x(
+ iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
+ iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
+ );
+ const __m256i q1b_2 = _mm256_set_epi64x(
+ iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
+ iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
+ );
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+
+ const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+ const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+ const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
+ const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+ #if QK_K == 64
+ __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 4), _mm_set1_epi16(sc[0] >> 0));
+ __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8));
+ #else
+ __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
+ __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+ #endif
+ scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
+ scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+ const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
+ const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
+ const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
+ const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
+
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
+
+ qs += 8; qh += 4;
+ }
+
+ #if QK_K == 64
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+ #else
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+ #endif
+ accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
+ accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
+
+ }
+
+ *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+ #else
+
+ int sum1[2], sum2[2], delta[4];
+
+ float sumf = 0;
+ for (int i = 0; i < nb; i++) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ #if QK_K != 64
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ #endif
+
+ int sumi1 = 0, sumi2 = 0;
+ for (int ib = 0; ib < QK_K/32; ++ib) {
+ delta[0] = qh[0] & 0x08 ? -1 : 1;
+ delta[1] = qh[0] & 0x80 ? -1 : 1;
+ delta[2] = qh[1] & 0x08 ? -1 : 1;
+ delta[3] = qh[1] & 0x80 ? -1 : 1;
+ sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
+ for (int l = 0; l < 4; ++l) {
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
+ int lsum1 = 0, lsum2 = 0;
+ for (int j = 0; j < 8; ++j) {
+ lsum1 += q8[j] * grid[j];
+ lsum2 += q8[j];
+ }
+ q8 += 8;
+ sum1[l/2] += lsum1;
+ sum2[l/2] += lsum2*delta[l];
+ }
+ #if QK_K == 64
+ const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1;
+ const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1;
+ #else
+ const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
+ const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
+ #endif
+ sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
+ sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
+ qs += 4;
+ qh += 2;
+ }
+
+ #if QK_K == 64
+ sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+ #else
+ sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+ #endif
+ }
+
+ *s = sumf;
+
+ #endif
+ }
+
  void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(nrc == 1);
  UNUSED(nrc);
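All three backends of the new dot product keep two accumulators. Each dequantized IQ1_M value has the form d*ls*(grid[j] + delta), where delta = ±IQ1M_DELTA is shared by a group of 8 values, so the dot with q8 factors into a grid term and a plain sum of q8, letting IQ1M_DELTA be applied once at the end (sumi1/sumi2 above, accum1/accum2 on AVX2). A scalar sketch of one 8-value group (hypothetical helper, illustration only):

#include <stdint.h>

/* ls is the odd sub-block multiplier 2l+1; delta is ±IQ1M_DELTA as decoded
   from the qh bits. The identity used by the kernels:
   sum_j q8[j]*ls*(grid[j] + delta) == ls*dot(grid, q8) + delta*ls*sum(q8). */
static float iq1m_group_dot(const int8_t grid[8], const int8_t q8[8], int ls, float delta) {
    int dot = 0, sum = 0;
    for (int j = 0; j < 8; ++j) { dot += q8[j]*grid[j]; sum += q8[j]; }
    return ls*dot + delta*(ls*sum);
}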
@@ -9938,17 +10239,17 @@ static iq2_entry_t iq2_data[4] = {
  };
 
  static inline int iq2_data_index(enum ggml_type type) {
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
  return type == GGML_TYPE_IQ2_XXS ? 0 :
  type == GGML_TYPE_IQ2_XS ? 1 :
- type == GGML_TYPE_IQ1_S ? 2 : 3;
+ type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
  }
 
  static inline int iq2_grid_size(enum ggml_type type) {
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
  return type == GGML_TYPE_IQ2_XXS ? 256 :
  type == GGML_TYPE_IQ2_XS ? 512 :
- type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024;
+ type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
  }
 
  static int iq2_compare_func(const void * left, const void * right) {
@@ -10214,10 +10515,10 @@ void iq2xs_init_impl(enum ggml_type type) {
 
  const int kmap_size = 43692;
  //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
- const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
+ const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
  const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
  type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
- type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024;
+ type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
  uint64_t * kgrid_q2xs;
  int * kmap_q2xs;
  uint16_t * kneighbors_q2xs;
@@ -10314,7 +10615,7 @@ void iq2xs_init_impl(enum ggml_type type) {
  }
 
  void iq2xs_free_impl(enum ggml_type type) {
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
  const int gindex = iq2_data_index(type);
  if (iq2_data[gindex].grid) {
  free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
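The three hunks above register GGML_TYPE_IQ1_M everywhere GGML_TYPE_IQ1_S is accepted: both map to iq2_data slot 2, share the same NGRID_IQ1S-entry 1-bit grid (kgrid_1bit_2048) with its map and neighbour tables, and use the same neighbour count (nwant = 3). The two types therefore differ only in block layout and scale/delta encoding, not in the codebook.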
@@ -11520,7 +11821,16 @@ static int iq1_sort_helper(const void * left, const void * right) {
  }
 
  #define IQ1S_BLOCK_SIZE 32
- static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+ #define IQ1M_BLOCK_SIZE 16
+ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+ float * scales,
+ float * weight,
+ float * sumx,
+ float * sumw,
+ float * pairs,
+ int8_t * L,
+ uint16_t * index,
+ int8_t * shifts) {
 
  const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
 
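The widened signature threads the scratch arrays (scales, weight, sumx, sumw, pairs, L, index, shifts) in from the caller instead of declaring them on the impl's stack; quantize_iq1_s (further down) now allocates them once and reuses them across all rows. IQ1M_BLOCK_SIZE, half of IQ1S_BLOCK_SIZE, is the sub-block width used by the IQ1_M quantizer added later in this diff.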
@@ -11534,22 +11844,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
  GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
  GGML_ASSERT(n%QK_K == 0);
 
+ block_iq1_s * y = vy;
+
  const int nbl = n/QK_K;
 
- block_iq1_s * y = vy;
+ const int block_size = IQ1S_BLOCK_SIZE;
 
  const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
  const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
 
- float scales[QK_K/IQ1S_BLOCK_SIZE];
- float weight[IQ1S_BLOCK_SIZE];
- int8_t L[IQ1S_BLOCK_SIZE];
- float sumx[IQ1S_BLOCK_SIZE+1];
- float sumw[IQ1S_BLOCK_SIZE+1];
- float pairs[2*IQ1S_BLOCK_SIZE];
+
  int * idx = (int *)(pairs + 1);
- uint16_t index[IQ1S_BLOCK_SIZE/8];
- int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
 
  for (int ibl = 0; ibl < nbl; ++ibl) {
 
@@ -11564,15 +11869,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
  for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
  float sigma2 = 2*sumx2/QK_K;
 
- for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
- const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
- const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
- for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
+ const float * xb = xbl + block_size*ib;
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
  float max = fabsf(xb[0]);
- for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
+ for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
  if (!max) {
  scales[ib] = 0;
- memset(L, 1, IQ1S_BLOCK_SIZE);
+ memset(L, 1, block_size);
  continue;
  }
  // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
@@ -11581,14 +11886,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
  // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
  // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum
  // scale and score for each possible split.
- for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+ for (int j = 0; j < block_size; ++j) {
  pairs[2*j] = xb[j];
  idx[2*j] = j;
  }
- qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
+ qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
  {
  sumx[0] = sumw[0] = 0;
- for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+ for (int j = 0; j < block_size; ++j) {
  int i = idx[2*j];
  sumx[j+1] = sumx[j] + weight[i]*xb[i];
  sumw[j+1] = sumw[j] + weight[i];
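Spelling out the prefix-sum trick the comment refers to: with the block values sorted and S_i = sum of weight[j]*xb[j], W_i = sum of weight[j] over the first i sorted entries, a candidate split (i1, i2) that assigns the three groups to quant levels (q0, q1, q2) has

    sumqx = (S_i1 - S_0)*q0 + (S_i2 - S_i1)*q1 + (S_n - S_i2)*q2
    sumq2 = (W_i1 - W_0)*q0^2 + (W_i2 - W_i1)*q1^2 + (W_n - W_i2)*q2^2

and minimizing sum_j w_j*(x_j - s*q_j)^2 over the scale s gives s* = sumqx/sumq2 with score sumqx^2/sumq2. The loop in the next hunk maximizes exactly this score, once for the shifted-up levels x_p and once for the shifted-down levels x_m.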
@@ -11596,16 +11901,16 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
  }
  float best_score = 0, scale = max;
  int besti1 = -1, besti2 = -1, best_shift = 0;
- for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
- for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
- float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
- float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
+ for (int i1 = 0; i1 <= block_size; ++i1) {
+ for (int i2 = i1; i2 <= block_size; ++i2) {
+ float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
+ float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
  if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
  scale = sumqx/sumq2; best_score = scale*sumqx;
  besti1 = i1; besti2 = i2; best_shift = 1;
  }
- sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
- sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
+ sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
+ sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
  if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
  scale = sumqx/sumq2; best_score = scale*sumqx;
  besti1 = i1; besti2 = i2; best_shift = -1;
@@ -11615,14 +11920,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
  GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
  for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
  for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
- for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
+ for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
  if (scale < 0) {
- for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
+ for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
  scale = -scale; best_shift = -best_shift;
  }
  bool all_on_grid = true;
  const float * xx = best_shift == 1 ? x_p : x_m;
- for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
+ for (int k = 0; k < block_size/8; ++k) {
  uint16_t u = 0;
  for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
  int grid_index = kmap_q2xs[u];
@@ -11636,7 +11941,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
  }
  if (!all_on_grid) {
  float sumqx = 0, sumq2 = 0;
- for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
+ for (int k = 0; k < block_size/8; ++k) {
  const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
  for (int j = 0; j < 8; ++j) {
  float w = weight[8*k + j];
@@ -11648,8 +11953,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
  if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
  }
  uint16_t h = 0;
- for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
- y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255;
+ for (int k = 0; k < block_size/8; ++k) {
+ y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
  h |= (index[k] >> 8) << 3*k;
  }
  y[ibl].qh[ib] = h;
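Each grid index is 11 bits wide (NGRID_IQ1S = 2048 entries): the low byte lands in qs[] and the 3 high bits are packed into consecutive fields of the 16-bit qh word, matching the qs[k] | ((qh << ...) & 0x700) reads in the dequantizers. A round-trip sketch (hypothetical helper, illustration only):

#include <stdint.h>

/* Pack four 11-bit grid indices as IQ1_S does: 8 low bits each into qs[],
   3 high bits each into h. Recover with
   index[k] == qs[k] | (((h >> 3*k) & 7) << 8). */
static uint16_t pack_iq1s_indices(const uint16_t index[4], uint8_t qs[4]) {
    uint16_t h = 0;
    for (int k = 0; k < 4; ++k) {
        qs[k] = index[k] & 255;
        h |= (uint16_t)((index[k] >> 8) << 3*k);
    }
    return h;
}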
@@ -11660,14 +11965,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
  }
 
  if (!max_scale) {
- memset(y[ibl].qs, 0, QK_K/8);
  continue;
  }
 
  float d = max_scale/15;
- y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.085f is another fudge factor. Don't ask me why it is needed.
+ y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
  float id = 1/d;
- for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
  int l = nearest_int(0.5f*(id*scales[ib]-1));
  l = MAX(0, MIN(7, l));
  if (shifts[ib] == -1) l |= 8;
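On the scale encoding in this loop: a sub-block's effective multiplier is the odd integer 2l+1 (the dequantizer reads 2*((sc[...]) & 0x7) + 1), so the 3-bit l is recovered from a real-valued scale s by l = round((s/d - 1)/2), which is the nearest_int(0.5f*(id*scales[ib]-1)) line; the chosen shift sign is kept in bit 3 via l |= 8.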
@@ -11678,16 +11982,307 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
 
  size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
  GGML_ASSERT(n_per_row%QK_K == 0);
+ float scales[QK_K/IQ1S_BLOCK_SIZE];
+ float weight[IQ1S_BLOCK_SIZE];
+ int8_t L[IQ1S_BLOCK_SIZE];
+ float sumx[IQ1S_BLOCK_SIZE+1];
+ float sumw[IQ1S_BLOCK_SIZE+1];
+ float pairs[2*IQ1S_BLOCK_SIZE];
+ uint16_t index[IQ1S_BLOCK_SIZE/8];
+ int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
  int nblock = n_per_row/QK_K;
  char * qrow = (char *)dst;
  for (int row = 0; row < nrow; ++row) {
- quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
+ quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
  src += n_per_row;
  qrow += nblock*sizeof(block_iq1_s);
  }
  return nrow * nblock * sizeof(block_iq1_s);
  }
 
+ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+ float * scales,
+ float * weight,
+ float * pairs,
+ int8_t * L,
+ uint16_t * index,
+ int8_t * shifts) {
+
+ const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
+
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
+ const int * kmap_q2xs = iq2_data[gindex].map;
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+ //GGML_ASSERT(quant_weights && "missing quantization weights");
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(n%QK_K == 0);
+
+ block_iq1_m * y = vy;
+
+ const int nbl = n/QK_K;
+
+ const int block_size = IQ1M_BLOCK_SIZE;
+
+ const float x_p[3] = {-1 + IQ1M_DELTA, IQ1M_DELTA, 1 + IQ1M_DELTA};
+ const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
+ const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
+
+ int * idx = (int *)(pairs + 1);
+
+ float sumqx[4], sumq2[4];
+
+ iq1m_scale_t s;
+ const float * xx;
+
+ for (int ibl = 0; ibl < nbl; ++ibl) {
+
+ #if QK_K == 64
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
+ #endif
+ memset(y[ibl].qs, 0, QK_K/8);
+ memset(y[ibl].qh, 0, QK_K/16);
+ memset(y[ibl].scales, 0, QK_K/32);
+
+ float max_scale = 0;
+
+ const float * xbl = x + QK_K*ibl;
+ float sumx2 = 0;
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+ float sigma2 = 2*sumx2/QK_K;
+
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
+ const float * xb = xbl + block_size*ib;
+ if (quant_weights) {
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ } else {
+ for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+ }
+ float max = fabsf(xb[0]);
+ for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
+ if (!max) {
+ scales[ib] = 0;
+ memset(L, 1, block_size);
+ continue;
+ }
+ // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+ // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+ // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+ // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+ // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum
+ // scale and score for each possible split.
+ for (int j = 0; j < block_size; ++j) {
+ pairs[2*j] = xb[j];
+ idx[2*j] = j;
+ }
+ qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+ float best_score = 0, scale = max;
+ int besti1 = -1, besti2 = -1, best_k = -1;
+ // 0: +, +
+ // 1: +, -
+ // 2: -, +
+ // 3: -, -
+ for (int i1 = 0; i1 <= block_size; ++i1) {
+ for (int i2 = i1; i2 <= block_size; ++i2) {
+ memset(sumqx, 0, 4*sizeof(float));
+ memset(sumq2, 0, 4*sizeof(float));
+ for (int j = 0; j < i1; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
+ sumqx[1] += weight[i]*x_p[0]*xb[i];
+ sumqx[2] += weight[i]*x_m[0]*xb[i];
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
+ sumq2[1] += weight[i]*x_p[0]*x_p[0];
+ sumq2[2] += weight[i]*x_m[0]*x_m[0];
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
+ } else {
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
+ sumqx[2] += weight[i]*x_p[0]*xb[i];
+ sumqx[1] += weight[i]*x_m[0]*xb[i];
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
+ sumq2[2] += weight[i]*x_p[0]*x_p[0];
+ sumq2[1] += weight[i]*x_m[0]*x_m[0];
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
+ }
+ }
+ for (int j = i1; j < i2; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
+ sumqx[1] += weight[i]*x_p[1]*xb[i];
+ sumqx[2] += weight[i]*x_m[1]*xb[i];
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
+ sumq2[1] += weight[i]*x_p[1]*x_p[1];
+ sumq2[2] += weight[i]*x_m[1]*x_m[1];
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
+ } else {
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
+ sumqx[2] += weight[i]*x_p[1]*xb[i];
+ sumqx[1] += weight[i]*x_m[1]*xb[i];
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
+ sumq2[2] += weight[i]*x_p[1]*x_p[1];
+ sumq2[1] += weight[i]*x_m[1]*x_m[1];
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
+ }
+ }
+ for (int j = i2; j < block_size; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
+ sumqx[1] += weight[i]*x_p[2]*xb[i];
+ sumqx[2] += weight[i]*x_m[2]*xb[i];
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
+ sumq2[1] += weight[i]*x_p[2]*x_p[2];
+ sumq2[2] += weight[i]*x_m[2]*x_m[2];
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
+ } else {
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
+ sumqx[2] += weight[i]*x_p[2]*xb[i];
+ sumqx[1] += weight[i]*x_m[2]*xb[i];
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
+ sumq2[2] += weight[i]*x_p[2]*x_p[2];
+ sumq2[1] += weight[i]*x_m[2]*x_m[2];
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
+ }
+ }
+ for (int k = 0; k < 4; ++k) {
+ if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
+ scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
+ besti1 = i1; besti2 = i2; best_k = k;
+ }
+ }
+ }
+ }
+ GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
+ for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+ for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+ for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+ if (scale < 0) {
+ for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+ scale = -scale;
+ best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
+ }
+ bool all_on_grid = true;
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
+ else xx = best_k%2 == 0 ? x_p : x_m;
+ uint16_t u = 0;
+ for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+ int grid_index = kmap_q2xs[u];
+ if (grid_index < 0) {
+ all_on_grid = false;
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+ grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+ GGML_ASSERT(grid_index >= 0);
+ }
+ index[k] = grid_index;
+ }
+ if (!all_on_grid) {
+ float sumqx_f = 0, sumq2_f = 0;
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
+ else xx = best_k%2 == 0 ? x_p : x_m;
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
+ for (int j = 0; j < 8; ++j) {
+ float w = weight[8*k + j];
+ float q = xx[(pg[j] - 1)/2];
+ sumqx_f += w*q*xb[8*k+j];
+ sumq2_f += w*q*q;
+ }
+ }
+ if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
+ }
+ y[ibl].qs[2*ib + 0] = index[0] & 255;
+ y[ibl].qs[2*ib + 1] = index[1] & 255;
+ y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
+ GGML_ASSERT(scale >= 0);
+ scales[ib] = scale;
+ shifts[ib] = best_k;
+ max_scale = MAX(max_scale, scale);
+ }
+
+ if (!max_scale) {
+ continue;
+ }
+
+ uint16_t * sc = (uint16_t *)y[ibl].scales;
+ #if QK_K == 64
+ float d = max_scale/31;
+ #else
+ float d = max_scale/15;
+ #endif
+ float id = 1/d;
+ float sumqx_f = 0, sumq2_f = 0;
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
+ int l = nearest_int(0.5f*(id*scales[ib+0]-1));
+ #if QK_K == 64
+ l = MAX(0, MIN(15, l));
+ sc[ib/4] |= (l << 4*(ib%4));
+ #else
+ l = MAX(0, MIN(7, l));
+ sc[ib/4] |= (l << 3*(ib%4));
+ #endif
+ y[ibl].qh[ib] |= masks[shifts[ib]];
+ const float * xb = xbl + block_size*ib;
+ if (quant_weights) {
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+ } else {
+ for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+ }
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
+ else xx = shifts[ib]%2 == 0 ? x_p : x_m;
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
+ for (int j = 0; j < 8; ++j) {
+ float w = weight[8*k + j];
+ float q = xx[(pg[j] - 1)/2]*(2*l+1);
+ sumqx_f += w*q*xb[8*k+j];
+ sumq2_f += w*q*q;
+ }
+ }
+ }
+ if (sumq2_f > 0) d = sumqx_f/sumq2_f;
+ s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
+ #if QK_K == 64
+ y[ibl].d = s.f16;
+ #else
+ sc[0] |= ((s.u16 & 0x000f) << 12);
+ sc[1] |= ((s.u16 & 0x00f0) << 8);
+ sc[2] |= ((s.u16 & 0x0f00) << 4);
+ sc[3] |= ((s.u16 & 0xf000) << 0);
+ #endif
+ }
+ }
+
+ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ GGML_ASSERT(n_per_row%QK_K == 0);
+ float scales[QK_K/IQ1M_BLOCK_SIZE];
+ float weight[IQ1M_BLOCK_SIZE];
+ int8_t L[IQ1M_BLOCK_SIZE];
+ float pairs[2*IQ1M_BLOCK_SIZE];
+ uint16_t index[IQ1M_BLOCK_SIZE/8];
+ int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
+ int nblock = n_per_row/QK_K;
+ char * qrow = (char *)dst;
+ for (int row = 0; row < nrow; ++row) {
+ quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
+ src += n_per_row;
+ qrow += nblock*sizeof(block_iq1_m);
+ }
+ return nrow * nblock * sizeof(block_iq1_m);
+ }
+
  // ============================ 4-bit non-linear quants
 
  static inline int best_index_int8(int n, const int8_t * val, float x) {
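Two details of the new IQ1_M quantizer are easy to miss. First, where the IQ1_S search tries a single shared shift (±IQ1S_DELTA) per block, the loop above carries four (sumqx[k], sumq2[k]) accumulator pairs in parallel, one per sign combination of the block's two 8-value halves (the // 0: +, + table); the i < block_size/2 test routes each sorted element to its half. Second, the winning combination best_k is stored in shifts[ib] and folded into the two spare high bits of qh through masks[4] = {0x00, 0x80, 0x08, 0x88}: bit 0x08 flags a negative delta for the first half and bit 0x80 for the second, which is exactly what dequantize_row_iq1_m and ggml_vec_dot_iq1_m_q8_K test when reconstructing delta.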
@@ -11705,9 +12300,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
  ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
  float * scales, float * weight, uint8_t * L,
  const int8_t * values,
- const float * quant_weights) {
-
- const int ntry = 7;
+ const float * quant_weights,
+ const int ntry) {
 
  float sigma2 = 0;
  for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
@@ -11719,6 +12313,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
  float max_scale = 0, amax_scale = 0;
  for (int ib = 0; ib < super_block_size/block_size; ++ib) {
  const float * xb = x + ib*block_size;
+ uint8_t * Lb = L + ib*block_size;
  if (quant_weights) {
  const float * qw = quant_weights + ib*block_size;
  for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
@@ -11736,12 +12331,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
  scales[ib] = 0;
  continue;
  }
- float d = -max/values[0];
+ float d = ntry > 0 ? -max/values[0] : max/values[0];
  float id = 1/d;
  float sumqx = 0, sumq2 = 0;
  for (int j = 0; j < block_size; ++j) {
  float al = id*xb[j];
  int l = best_index_int8(16, values, al);
+ Lb[j] = l;
  float q = values[l];
  float w = weight[j];
  sumqx += w*q*xb[j];
@@ -11796,9 +12392,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
  }
  } else {
  dh[0] = GGML_FP32_TO_FP16(scales[0]);
- float id = scales[0] ? 1/scales[0] : 0;
- for (int j = 0; j < super_block_size; ++j) {
- L[j] = best_index_int8(16, values, id*x[j]);
+ if (ntry > 0) {
+ float id = scales[0] ? 1/scales[0] : 0;
+ for (int j = 0; j < super_block_size; ++j) {
+ L[j] = best_index_int8(16, values, id*x[j]);
+ }
  }
  }
 
@@ -11823,7 +12421,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
  for (int ibl = 0; ibl < nblock; ++ibl) {
  const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
  quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
- &scale, weight, L, kvalues_iq4nl, qw);
+ &scale, weight, L, kvalues_iq4nl, qw, 7);
  }
  src += n_per_row;
  qrow += nblock*sizeof(block_iq4_nl);
@@ -11832,14 +12430,23 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
  }
 
  void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
- assert(k % QK4_NL == 0);
- block_iq4_nl * restrict y = vy;
- quantize_row_iq4_nl_reference(x, y, k);
+ GGML_ASSERT(k%QK4_NL == 0);
+ int nblock = k/QK4_NL;
+ uint8_t L[QK4_NL];
+ float weight[QK4_NL];
+ uint16_t unused_h;
+ uint8_t * unused_l = NULL;
+ float scale;
+ block_iq4_nl * iq4 = (block_iq4_nl *)vy;
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
+ &scale, weight, L, kvalues_iq4nl, NULL, -1);
+ }
  }
 
  void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
  assert(k % QK4_NL == 0);
- quantize_iq4_nl(x, y, 1, k, NULL);
+ quantize_row_iq4_nl(x, y, k);
  }
 
  size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
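The new ntry parameter replaces the hard-coded const int ntry = 7: quantize_iq4_nl and quantize_iq4_xs keep the previous behaviour by passing 7, while the rewritten in-place quantize_row_iq4_nl (which has no importance weights) passes -1. With ntry <= 0 the initial scale keeps the sign of max/values[0], and the final ntry > 0 re-quantization branch is skipped in favour of the Lb[j] indices recorded during the first pass. quantize_row_iq4_nl_reference now simply delegates to quantize_row_iq4_nl, removing the old indirection through quantize_iq4_nl.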
@@ -11857,7 +12464,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
  for (int ibl = 0; ibl < nblock; ++ibl) {
  const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
  quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
- scales, weight, L, kvalues_iq4nl, qw);
+ scales, weight, L, kvalues_iq4nl, qw, 7);
  }
  src += n_per_row;
  qrow += nblock*sizeof(block_iq4_xs);