llama_cpp 0.14.3 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
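At a glance, the diff widens the element-count parameters of the bundled ggml quantization/dequantization routines from int to int64_t (casting nrow*n_per_row products accordingly), adds IQ1_M support (dequantize_row_iq1_m, ggml_vec_dot_iq1_m_q8_K, and IQ1_M handling in the iq2 grid helpers), and extends an AVX-VNNI guard to __AVX512VNNI__. The short C sketch below is not part of the package; it only illustrates, with a hypothetical tensor shape, the 32-bit overflow that the int64_t widening avoids.

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* hypothetical shape, chosen only so the element count exceeds the 32-bit range */
        const int64_t nrow      = 131072;
        const int64_t n_per_row = 32768;

        /* mirrors the new call sites, e.g. quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row) */
        const int64_t k = nrow * n_per_row;   /* 4294967296 elements */

        printf("k = %lld, INT32_MAX = %lld\n", (long long)k, (long long)INT32_MAX);
        /* k > INT32_MAX, so the old `int k` parameters could not represent this count */
        return 0;
    }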
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
  }

  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
- #if __AVXVNNI__
+ #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
  const __m256i zero = _mm256_setzero_si256();
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
  return _mm256_cvtepi32_ps(summed_pairs);
@@ -544,7 +544,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
  #endif

  // reference implementation for deterministic creation of model files
- void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
+ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
  static const int qk = QK4_0;

  assert(k % qk == 0);
@@ -581,12 +581,12 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
  }
  }

- void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
+ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
  quantize_row_q4_0_reference(x, y, k);
  }


- void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
+ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
  const int qk = QK4_1;

  assert(k % qk == 0);
@@ -623,11 +623,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
  }
  }

- void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
+ void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
  quantize_row_q4_1_reference(x, y, k);
  }

- void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
+ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
  static const int qk = QK5_0;

  assert(k % qk == 0);
@@ -671,11 +671,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
  }
  }

- void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
+ void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
  quantize_row_q5_0_reference(x, y, k);
  }

- void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
+ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
  const int qk = QK5_1;

  assert(k % qk == 0);
@@ -719,12 +719,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
  }
  }

- void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
+ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
  quantize_row_q5_1_reference(x, y, k);
  }

  // reference implementation for deterministic creation of model files
- void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
+ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
  assert(k % QK8_0 == 0);
  const int nb = k / QK8_0;

@@ -749,7 +749,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict
  }
  }

- void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
+ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
  assert(QK8_0 == 32);
  assert(k % QK8_0 == 0);
  const int nb = k / QK8_0;
@@ -938,7 +938,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
  }

  // reference implementation for deterministic creation of model files
- void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
+ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
  assert(QK8_1 == 32);
  assert(k % QK8_1 == 0);
  const int nb = k / QK8_1;
@@ -973,7 +973,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
  }
  }

- void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
+ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
  assert(k % QK8_1 == 0);
  const int nb = k / QK8_1;

@@ -1192,7 +1192,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
  #endif
  }

- void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
+ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
  static const int qk = QK4_0;

  assert(k % qk == 0);
@@ -1212,7 +1212,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
  }
  }

- void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
+ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
  static const int qk = QK4_1;

  assert(k % qk == 0);
@@ -1233,7 +1233,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
  }
  }

- void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
+ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
  static const int qk = QK5_0;

  assert(k % qk == 0);
@@ -1259,7 +1259,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int
  }
  }

- void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
+ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
  static const int qk = QK5_1;

  assert(k % qk == 0);
@@ -1286,7 +1286,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int
  }
  }

- void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) {
+ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
  static const int qk = QK8_0;

  assert(k % qk == 0);
@@ -1581,7 +1581,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *

  //========================- 2-bit (de)-quantization

- void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
+ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
  assert(k % QK_K == 0);
  const int nb = k / QK_K;

@@ -1658,7 +1658,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
  }
  }

- void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
+ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
  const int nb = k / QK_K;

@@ -1704,7 +1704,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
  }
  }

- void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
+ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
  quantize_row_q2_K_reference(x, vy, k);
  }

@@ -1960,14 +1960,14 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
  }
  }

- size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
  if (!quant_weights) {
- quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
+ quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
  }
  else {
  char * qrow = (char *)dst;
- for (int row = 0; row < nrow; ++row) {
+ for (int64_t row = 0; row < nrow; ++row) {
  quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
  src += n_per_row;
  qrow += row_size;
@@ -1978,7 +1978,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow,

  //========================= 3-bit (de)-quantization

- void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
+ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
  assert(k % QK_K == 0);
  const int nb = k / QK_K;

@@ -2092,7 +2092,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
  }

  #if QK_K == 256
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
  const int nb = k / QK_K;

@@ -2142,7 +2142,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
  }
  }
  #else
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
  assert(QK_K == 64);
  const int nb = k / QK_K;
@@ -2175,11 +2175,11 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
  }
  #endif

- void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
+ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
  quantize_row_q3_K_reference(x, vy, k);
  }

- static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
+ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
  #if QK_K != 256
  (void)quant_weights;
  quantize_row_q3_K_reference(x, y, n_per_row);
@@ -2268,14 +2268,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
  #endif
  }

- size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
  if (!quant_weights) {
- quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
+ quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
  }
  else {
  char * qrow = (char *)dst;
- for (int row = 0; row < nrow; ++row) {
+ for (int64_t row = 0; row < nrow; ++row) {
  quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
  src += n_per_row;
  qrow += row_size;
@@ -2286,7 +2286,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow,

  // ====================== 4-bit (de)-quantization

- void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
+ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
  assert(k % QK_K == 0);
  const int nb = k / QK_K;

@@ -2393,7 +2393,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
  }
  }

- void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
+ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
  const int nb = k / QK_K;

@@ -2432,19 +2432,19 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
  }
  }

- void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
+ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
  assert(k % QK_K == 0);
  block_q4_K * restrict y = vy;
  quantize_row_q4_K_reference(x, y, k);
  }

- static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
+ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
  #if QK_K != 256
  (void)quant_weights;
  quantize_row_q4_K_reference(x, y, n_per_row);
  #else
  assert(n_per_row % QK_K == 0);
- const int nb = n_per_row / QK_K;
+ const int64_t nb = n_per_row / QK_K;

  uint8_t L[QK_K];
  uint8_t Laux[32];
@@ -2516,14 +2516,14 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
  #endif
  }

- size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
  if (!quant_weights) {
- quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
+ quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
  }
  else {
  char * qrow = (char *)dst;
- for (int row = 0; row < nrow; ++row) {
+ for (int64_t row = 0; row < nrow; ++row) {
  quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
  src += n_per_row;
  qrow += row_size;
@@ -2534,9 +2534,9 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow,

  // ====================== 5-bit (de)-quantization

- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
+ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  #if QK_K == 256
  uint8_t L[QK_K];
@@ -2676,9 +2676,9 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
  }
  }

- void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
+ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  for (int i = 0; i < nb; i++) {

@@ -2721,19 +2721,19 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
  }
  }

- void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
+ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
  assert(k % QK_K == 0);
  block_q5_K * restrict y = vy;
  quantize_row_q5_K_reference(x, y, k);
  }

- static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
+ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
  #if QK_K != 256
  (void)quant_weights;
  quantize_row_q5_K_reference(x, y, n_per_row);
  #else
  assert(n_per_row % QK_K == 0);
- const int nb = n_per_row / QK_K;
+ const int64_t nb = n_per_row / QK_K;

  uint8_t L[QK_K];
  uint8_t Laux[32];
@@ -2825,14 +2825,14 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
  #endif
  }

- size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
  if (!quant_weights) {
- quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
+ quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
  }
  else {
  char * qrow = (char *)dst;
- for (int row = 0; row < nrow; ++row) {
+ for (int64_t row = 0; row < nrow; ++row) {
  quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
  src += n_per_row;
  qrow += row_size;
@@ -2843,9 +2843,9 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow,

  // ====================== 6-bit (de)-quantization

- void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
+ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  int8_t L[QK_K];
  float scales[QK_K/16];
@@ -2925,9 +2925,9 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
  }
  }

- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
+ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  for (int i = 0; i < nb; i++) {

@@ -2972,19 +2972,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
  }
  }

- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
+ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
  assert(k % QK_K == 0);
  block_q6_K * restrict y = vy;
  quantize_row_q6_K_reference(x, y, k);
  }

- static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
+ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
  #if QK_K != 256
  (void)quant_weights;
  quantize_row_q6_K_reference(x, y, n_per_row);
  #else
  assert(n_per_row % QK_K == 0);
- const int nb = n_per_row / QK_K;
+ const int64_t nb = n_per_row / QK_K;

  int8_t L[QK_K];
  float scales[QK_K/16];
@@ -3067,14 +3067,14 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
  #endif
  }

- size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
  if (!quant_weights) {
- quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
+ quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
  }
  else {
  char * qrow = (char *)dst;
- for (int row = 0; row < nrow; ++row) {
+ for (int64_t row = 0; row < nrow; ++row) {
  quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
  src += n_per_row;
  qrow += row_size;
@@ -3083,7 +3083,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow,
  return nrow * row_size;
  }

- static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
+ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
  static_assert(QK4_0 == 32, "QK4_0 must be 32");

  if (!quant_weights) {
@@ -3098,7 +3098,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
  float sigma2 = sum_x2/n_per_row;

- const int nb = n_per_row/QK4_0;
+ const int64_t nb = n_per_row/QK4_0;
  for (int ib = 0; ib < nb; ++ib) {
  const float * xb = x + QK4_0 * ib;
  const float * qw = quant_weights + QK4_0 * ib;
@@ -3111,14 +3111,14 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
  }
  }

- size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  if (!quant_weights) {
- quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
+ quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
  }
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
  char * qrow = (char *)dst;
- for (int row = 0; row < nrow; ++row) {
+ for (int64_t row = 0; row < nrow; ++row) {
  quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
  src += n_per_row;
  qrow += row_size;
@@ -3126,7 +3126,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow,
  return nrow * row_size;
  }

- static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
+ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
  static_assert(QK4_1 == 32, "QK4_1 must be 32");

  if (!quant_weights) {
@@ -3141,7 +3141,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
  float sigma2 = sum_x2/n_per_row;

- const int nb = n_per_row/QK4_1;
+ const int64_t nb = n_per_row/QK4_1;
  for (int ib = 0; ib < nb; ++ib) {
  const float * xb = x + QK4_1 * ib;
  const float * qw = quant_weights + QK4_1 * ib;
@@ -3156,14 +3156,14 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
  }
  }

- size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  if (!quant_weights) {
- quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
+ quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
  }
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
  char * qrow = (char *)dst;
- for (int row = 0; row < nrow; ++row) {
+ for (int64_t row = 0; row < nrow; ++row) {
  quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
  src += n_per_row;
  qrow += row_size;
@@ -3171,7 +3171,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow,
  return nrow * row_size;
  }

- static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
+ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
  static_assert(QK5_0 == 32, "QK5_0 must be 32");

  if (!quant_weights) {
@@ -3186,7 +3186,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
  float sigma2 = sum_x2/n_per_row;

- const int nb = n_per_row/QK5_0;
+ const int64_t nb = n_per_row/QK5_0;
  for (int ib = 0; ib < nb; ++ib) {
  const float * xb = x + QK5_0 * ib;
  const float * qw = quant_weights + QK5_0 * ib;
@@ -3210,14 +3210,14 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
  }
  }

- size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  if (!quant_weights) {
- quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
+ quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
  }
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
  char * qrow = (char *)dst;
- for (int row = 0; row < nrow; ++row) {
+ for (int64_t row = 0; row < nrow; ++row) {
  quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
  src += n_per_row;
  qrow += row_size;
@@ -3225,7 +3225,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow,
  return nrow * row_size;
  }

- static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
+ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
  static_assert(QK5_1 == 32, "QK5_1 must be 32");

  if (!quant_weights) {
@@ -3240,7 +3240,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
  float sigma2 = sum_x2/n_per_row;

- const int nb = n_per_row/QK5_1;
+ const int64_t nb = n_per_row/QK5_1;
  for (int ib = 0; ib < nb; ++ib) {
  const float * xb = x + QK5_1 * ib;
  const float * qw = quant_weights + QK5_1 * ib;
@@ -3263,14 +3263,14 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
  }
  }

- size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  if (!quant_weights) {
- quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
+ quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
  }
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
  char * qrow = (char *)dst;
- for (int row = 0; row < nrow; ++row) {
+ for (int64_t row = 0; row < nrow; ++row) {
  quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
  src += n_per_row;
  qrow += row_size;
@@ -3278,18 +3278,18 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow,
  return nrow * row_size;
  }

- size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
  (void)quant_weights; // not used
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
- quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
+ quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
  return nrow * row_size;
  }

  // ====================== "True" 2-bit (de)-quantization

- void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  uint32_t aux32[2];
  const uint8_t * aux8 = (const uint8_t *)aux32;
@@ -3315,9 +3315,9 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y

  // ====================== 2.3125 bpw (de)-quantization

- void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
+ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  float db[2];

@@ -3342,9 +3342,9 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,

  // ====================== 2.5625 bpw (de)-quantization

- void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
+ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  float db[2];

@@ -3374,9 +3374,9 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in

  // ====================== 3.0625 bpw (de)-quantization

- void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  uint32_t aux32;

@@ -3406,9 +3406,9 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y

  // ====================== 3.3125 bpw (de)-quantization

- void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
+ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  for (int i = 0; i < nb; i++) {

@@ -3449,9 +3449,9 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in

  // ====================== 1.5625 bpw (de)-quantization

- void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
+ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
  assert(k % QK_K == 0);
- const int nb = k / QK_K;
+ const int64_t nb = k / QK_K;

  for (int i = 0; i < nb; i++) {

@@ -3474,11 +3474,70 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
3474
3474
  }
3475
3475
  }
3476
3476
 
3477
+ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
3478
+ assert(k % QK_K == 0);
3479
+ const int64_t nb = k / QK_K;
3480
+
3481
+ float delta[4];
3482
+ uint16_t idx[4];
3483
+
3484
+ #if QK_K != 64
3485
+ iq1m_scale_t scale;
3486
+ #endif
3487
+
3488
+ for (int i = 0; i < nb; i++) {
3489
+
3490
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
3491
+ #if QK_K == 64
3492
+ const float d = GGML_FP16_TO_FP32(x[i].d);
3493
+ #else
3494
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
3495
+ const float d = GGML_FP16_TO_FP32(scale.f16);
3496
+ #endif
3497
+ const uint8_t * qs = x[i].qs;
3498
+ const uint8_t * qh = x[i].qh;
3499
+
3500
+ for (int ib = 0; ib < QK_K/32; ++ib) {
3501
+ #if QK_K == 64
3502
+ const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1);
3503
+ const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1);
3504
+ #else
3505
+ const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
3506
+ const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
3507
+ #endif
3508
+ idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
3509
+ idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
3510
+ idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
3511
+ idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
3512
+ delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
3513
+ delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
3514
+ delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
3515
+ delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
3516
+ for (int l = 0; l < 2; ++l) {
3517
+ const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
3518
+ for (int j = 0; j < 8; ++j) {
3519
+ y[j] = dl1 * (grid[j] + delta[l]);
3520
+ }
3521
+ y += 8;
3522
+ }
3523
+ for (int l = 2; l < 4; ++l) {
3524
+ const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
3525
+ for (int j = 0; j < 8; ++j) {
3526
+ y[j] = dl2 * (grid[j] + delta[l]);
3527
+ }
3528
+ y += 8;
3529
+ }
3530
+ qs += 4;
3531
+ qh += 2;
3532
+ }
3533
+ }
3534
+ }
3535
+
3477
3536
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
3478
3537
 
3479
- void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
3538
+ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
3480
3539
  assert(k % QK4_NL == 0);
3481
- const int nb = k / QK4_NL;
3540
+ const int64_t nb = k / QK4_NL;
3482
3541
 
3483
3542
  for (int i = 0; i < nb; i++) {
3484
3543
 
@@ -3494,12 +3553,12 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
3494
3553
  }
3495
3554
  }
3496
3555
 
3497
- void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
3556
+ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
3498
3557
  assert(k % QK_K == 0);
3499
3558
  #if QK_K == 64
3500
3559
  dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
3501
3560
  #else
3502
- const int nb = k / QK_K;
3561
+ const int64_t nb = k / QK_K;
3503
3562
 
3504
3563
  for (int i = 0; i < nb; i++) {
3505
3564
 
@@ -3523,9 +3582,9 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
3523
3582
 
3524
3583
  //===================================== Q8_K ==============================================
3525
3584
 
3526
- void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
3585
+ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
3527
3586
  assert(k % QK_K == 0);
3528
- const int nb = k / QK_K;
3587
+ const int64_t nb = k / QK_K;
3529
3588
 
3530
3589
  for (int i = 0; i < nb; i++) {
3531
3590
 
@@ -3562,9 +3621,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
3562
3621
  }
3563
3622
  }
3564
3623
 
3565
- void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
3624
+ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
3566
3625
  assert(k % QK_K == 0);
3567
- const int nb = k / QK_K;
3626
+ const int64_t nb = k / QK_K;
3568
3627
 
3569
3628
  for (int i = 0; i < nb; i++) {
3570
3629
  for (int j = 0; j < QK_K; ++j) {
@@ -3573,7 +3632,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int
3573
3632
  }
3574
3633
  }
3575
3634
 
3576
- void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
3635
+ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
3577
3636
  quantize_row_q8_K_reference(x, y, k);
3578
3637
  }
3579
3638
 
@@ -9695,6 +9754,248 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9695
9754
  #endif
9696
9755
  }
9697
9756
 
9757
+ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9758
+ assert(n % QK_K == 0);
9759
+ assert(nrc == 1);
9760
+ UNUSED(nrc);
9761
+ UNUSED(bx);
9762
+ UNUSED(by);
9763
+ UNUSED(bs);
9764
+
9765
+ const block_iq1_m * restrict x = vx;
9766
+ const block_q8_K * restrict y = vy;
9767
+
9768
+ const int nb = n / QK_K;
9769
+
9770
+ #if QK_K != 64
9771
+ iq1m_scale_t scale;
9772
+ #endif
9773
+
9774
+ #if defined __ARM_NEON
9775
+
9776
+ #if QK_K == 64
9777
+ const int32x4_t mask = vdupq_n_s32(0xf);
9778
+ #else
9779
+ const int32x4_t mask = vdupq_n_s32(0x7);
9780
+ #endif
9781
+ const int32x4_t mone = vdupq_n_s32(1);
9782
+ const int32x4_t mzero = vdupq_n_s32(0);
9783
+
9784
+ ggml_int8x16x4_t deltas;
9785
+ deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
9786
+ deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
9787
+ deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
9788
+ deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
9789
+
9790
+ ggml_int8x16x4_t q1b;
9791
+ ggml_int8x16x4_t q8b;
9792
+
9793
+ uint32_t aux32;
9794
+ const uint8_t * aux8 = (const uint8_t *)&aux32;
9795
+
9796
+ float sumf = 0;
9797
+ for (int i = 0; i < nb; ++i) {
9798
+
9799
+ const int8_t * q8 = y[i].qs;
9800
+ const uint8_t * qs = x[i].qs;
9801
+ const uint8_t * qh = x[i].qh;
9802
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
9803
+
9804
+ #if QK_K != 64
9805
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
9806
+ #endif
9807
+
9808
+ int32x4_t sumi1 = mzero;
9809
+ int32x4_t sumi2 = mzero;
9810
+
9811
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
9812
+
9813
+ q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
9814
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
9815
+ q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
9816
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
9817
+ q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
9818
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
9819
+ q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
9820
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
9821
+
9822
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9823
+
9824
+ const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
9825
+ const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
9826
+ const int32x4_t p12 = vpaddq_s32(p1, p2);
9827
+
9828
+ const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
9829
+ aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
9830
+
9831
+ const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
9832
+ const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
9833
+ const int32x4_t p34 = vpaddq_s32(p3, p4);
9834
+
9835
+ #if QK_K == 64
9836
+ int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12);
9837
+ #else
9838
+ int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
9839
+ #endif
9840
+ scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
9841
+
9842
+ sumi1 = vmlaq_s32(sumi1, scales_4, p12);
9843
+ sumi2 = vmlaq_s32(sumi2, scales_4, p34);
9844
+
9845
+ qs += 8; qh += 4;
9846
+
9847
+ }
9848
+
9849
+ #if QK_K == 64
9850
+ sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
9851
+ #else
9852
+ sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
9853
+ #endif
9854
+ }
9855
+
9856
+ *s = sumf;
9857
+
9858
+ #elif defined __AVX2__
9859
+
9860
+ #if QK_K == 64
9861
+ const __m256i mask = _mm256_set1_epi16(0xf);
9862
+ #else
9863
+ const __m256i mask = _mm256_set1_epi16(0x7);
9864
+ #endif
9865
+ const __m256i mone = _mm256_set1_epi16(1);
9866
+
9867
+ __m256 accum1 = _mm256_setzero_ps();
9868
+ __m256 accum2 = _mm256_setzero_ps();
9869
+ for (int i = 0; i < nb; ++i) {
9870
+
9871
+ const int8_t * q8 = y[i].qs;
9872
+ const uint8_t * qs = x[i].qs;
9873
+ const uint8_t * qh = x[i].qh;
9874
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
9875
+
9876
+ #if QK_K != 64
9877
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
9878
+ #endif
9879
+
9880
+ __m256i sumi1 = _mm256_setzero_si256();
9881
+ __m256i sumi2 = _mm256_setzero_si256();
9882
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
9883
+ const __m256i q1b_1 = _mm256_set_epi64x(
9884
+ iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
9885
+ iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
9886
+ );
9887
+ const __m256i q1b_2 = _mm256_set_epi64x(
9888
+ iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
9889
+ iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
9890
+ );
9891
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
9892
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
9893
+
9894
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
9895
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
9896
+
9897
+ const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
9898
+ qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
9899
+ qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
9900
+ qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
9901
+ const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
9902
+ qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
9903
+ qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
9904
+ qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
9905
+
9906
+ const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
9907
+ const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
9908
+ #if QK_K == 64
9909
+ __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 4), _mm_set1_epi16(sc[0] >> 0));
9910
+ __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8));
9911
+ #else
9912
+ __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
9913
+ __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
9914
+ #endif
9915
+ scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
9916
+ scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
9917
+ const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
9918
+ const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
9919
+ const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
9920
+ const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
9921
+
9922
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
9923
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
9924
+
9925
+ qs += 8; qh += 4;
9926
+ }
9927
+
9928
+ #if QK_K == 64
9929
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
9930
+ #else
9931
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
9932
+ #endif
9933
+ accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
9934
+ accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
9935
+
9936
+ }
9937
+
9938
+ *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
9939
+
9940
+ #else
9941
+
9942
+ int sum1[2], sum2[2], delta[4];
9943
+
9944
+ float sumf = 0;
9945
+ for (int i = 0; i < nb; i++) {
9946
+
9947
+ const int8_t * q8 = y[i].qs;
9948
+ const uint8_t * qs = x[i].qs;
9949
+ const uint8_t * qh = x[i].qh;
9950
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
9951
+
9952
+ #if QK_K != 64
9953
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
9954
+ #endif
9955
+
9956
+ int sumi1 = 0, sumi2 = 0;
9957
+ for (int ib = 0; ib < QK_K/32; ++ib) {
9958
+ delta[0] = qh[0] & 0x08 ? -1 : 1;
9959
+ delta[1] = qh[0] & 0x80 ? -1 : 1;
9960
+ delta[2] = qh[1] & 0x08 ? -1 : 1;
9961
+ delta[3] = qh[1] & 0x80 ? -1 : 1;
9962
+ sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
9963
+ for (int l = 0; l < 4; ++l) {
9964
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
9965
+ int lsum1 = 0, lsum2 = 0;
9966
+ for (int j = 0; j < 8; ++j) {
9967
+ lsum1 += q8[j] * grid[j];
9968
+ lsum2 += q8[j];
9969
+ }
9970
+ q8 += 8;
9971
+ sum1[l/2] += lsum1;
9972
+ sum2[l/2] += lsum2*delta[l];
9973
+ }
9974
+ #if QK_K == 64
9975
+ const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1;
9976
+ const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1;
9977
+ #else
9978
+ const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
9979
+ const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
9980
+ #endif
9981
+ sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
9982
+ sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
9983
+ qs += 4;
9984
+ qh += 2;
9985
+ }
9986
+
9987
+ #if QK_K == 64
9988
+ sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
9989
+ #else
9990
+ sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
9991
+ #endif
9992
+ }
9993
+
9994
+ *s = sumf;
9995
+
9996
+ #endif
9997
+ }
9998
+
9698
9999
  void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9699
10000
  assert(nrc == 1);
9700
10001
  UNUSED(nrc);
@@ -9938,17 +10239,17 @@ static iq2_entry_t iq2_data[4] = {
9938
10239
  };
9939
10240
 
9940
10241
  static inline int iq2_data_index(enum ggml_type type) {
9941
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10242
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
9942
10243
  return type == GGML_TYPE_IQ2_XXS ? 0 :
9943
10244
  type == GGML_TYPE_IQ2_XS ? 1 :
9944
- type == GGML_TYPE_IQ1_S ? 2 : 3;
10245
+ type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
9945
10246
  }
9946
10247
 
9947
10248
  static inline int iq2_grid_size(enum ggml_type type) {
9948
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10249
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
9949
10250
  return type == GGML_TYPE_IQ2_XXS ? 256 :
9950
10251
  type == GGML_TYPE_IQ2_XS ? 512 :
9951
- type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024;
10252
+ type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
9952
10253
  }
9953
10254
 
9954
10255
  static int iq2_compare_func(const void * left, const void * right) {
@@ -10214,10 +10515,10 @@ void iq2xs_init_impl(enum ggml_type type) {
10214
10515
 
10215
10516
  const int kmap_size = 43692;
10216
10517
  //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
10217
- const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
10518
+ const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
10218
10519
  const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
10219
10520
  type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
10220
- type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024;
10521
+ type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
10221
10522
  uint64_t * kgrid_q2xs;
10222
10523
  int * kmap_q2xs;
10223
10524
  uint16_t * kneighbors_q2xs;
@@ -10314,7 +10615,7 @@ void iq2xs_init_impl(enum ggml_type type) {
10314
10615
  }
10315
10616
 
10316
10617
  void iq2xs_free_impl(enum ggml_type type) {
10317
- GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10618
+ GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
10318
10619
  const int gindex = iq2_data_index(type);
10319
10620
  if (iq2_data[gindex].grid) {
10320
10621
  free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
@@ -10347,7 +10648,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
10347
10648
  return grid_index;
10348
10649
  }
10349
10650
 
10350
- static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
10651
+ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
10351
10652
 
10352
10653
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
10353
10654
 
@@ -10363,7 +10664,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
10363
10664
 
10364
10665
  const int kMaxQ = 3;
10365
10666
 
10366
- const int nbl = n/QK_K;
10667
+ const int64_t nbl = n/QK_K;
10367
10668
 
10368
10669
  block_iq2_xxs * y = vy;
10369
10670
 
@@ -10520,7 +10821,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
10520
10821
  }
10521
10822
  }
10522
10823
 
10523
- static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
10824
+ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
10524
10825
 
10525
10826
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
10526
10827
 
@@ -10536,7 +10837,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
10536
10837
 
10537
10838
  const int kMaxQ = 3;
10538
10839
 
10539
- const int nbl = n/QK_K;
10840
+ const int64_t nbl = n/QK_K;
10540
10841
 
10541
10842
  block_iq2_xs * y = vy;
10542
10843
 
@@ -10700,11 +11001,11 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
10700
11001
  }
10701
11002
  }
10702
11003
 
10703
- size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11004
+ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
10704
11005
  GGML_ASSERT(n_per_row%QK_K == 0);
10705
- int nblock = n_per_row/QK_K;
11006
+ int64_t nblock = n_per_row/QK_K;
10706
11007
  char * qrow = (char *)dst;
10707
- for (int row = 0; row < nrow; ++row) {
11008
+ for (int64_t row = 0; row < nrow; ++row) {
10708
11009
  quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
10709
11010
  src += n_per_row;
10710
11011
  qrow += nblock*sizeof(block_iq2_xxs);
@@ -10712,11 +11013,11 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nro
10712
11013
  return nrow * nblock * sizeof(block_iq2_xxs);
10713
11014
  }
10714
11015
 
10715
- size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11016
+ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
10716
11017
  GGML_ASSERT(n_per_row%QK_K == 0);
10717
- int nblock = n_per_row/QK_K;
11018
+ int64_t nblock = n_per_row/QK_K;
10718
11019
  char * qrow = (char *)dst;
10719
- for (int row = 0; row < nrow; ++row) {
11020
+ for (int64_t row = 0; row < nrow; ++row) {
10720
11021
  quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
10721
11022
  src += n_per_row;
10722
11023
  qrow += nblock*sizeof(block_iq2_xs);
@@ -10941,7 +11242,7 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
10941
11242
  return grid_index;
10942
11243
  }
10943
11244
 
10944
- static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
11245
+ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
10945
11246
  const float * restrict quant_weights) {
10946
11247
 
10947
11248
  const int gindex = iq3_data_index(grid_size);
@@ -10958,7 +11259,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
10958
11259
 
10959
11260
  const int kMaxQ = 8;
10960
11261
 
10961
- const int nbl = n/QK_K;
11262
+ const int64_t nbl = n/QK_K;
10962
11263
 
10963
11264
  ggml_fp16_t * dh;
10964
11265
  uint8_t * qs;
@@ -11154,11 +11455,11 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11154
11455
  }
11155
11456
  }
11156
11457
 
11157
- size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11458
+ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11158
11459
  GGML_ASSERT(n_per_row%QK_K == 0);
11159
- int nblock = n_per_row/QK_K;
11460
+ int64_t nblock = n_per_row/QK_K;
11160
11461
  char * qrow = (char *)dst;
11161
- for (int row = 0; row < nrow; ++row) {
11462
+ for (int64_t row = 0; row < nrow; ++row) {
11162
11463
  quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
11163
11464
  src += n_per_row;
11164
11465
  qrow += nblock*sizeof(block_iq3_xxs);
@@ -11166,13 +11467,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nro
11166
11467
  return nrow * nblock * sizeof(block_iq3_xxs);
11167
11468
  }
11168
11469
 
11169
- void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
11470
+ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
11170
11471
  assert(k % QK_K == 0);
11171
11472
  block_iq3_xxs * restrict y = vy;
11172
11473
  quantize_row_iq3_xxs_reference(x, y, k);
11173
11474
  }
11174
11475
 
11175
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
11476
+ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
11176
11477
  assert(k % QK_K == 0);
11177
11478
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
11178
11479
  }
@@ -11203,7 +11504,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11203
11504
 
11204
11505
  const int kMaxQ = 8;
11205
11506
 
11206
- const int nbl = n/QK_K;
11507
+ const int64_t nbl = n/QK_K;
11207
11508
 
11208
11509
  block_iq3_s * y = vy;
11209
11510
 
@@ -11360,9 +11661,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11360
11661
  }
11361
11662
 
11362
11663
  #define IQ3S_BLOCK_SIZE 32
11363
- size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11664
+ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11364
11665
  GGML_ASSERT(n_per_row%QK_K == 0);
11365
- int nblock = n_per_row/QK_K;
11666
+ int64_t nblock = n_per_row/QK_K;
11366
11667
  float scales[QK_K/IQ3S_BLOCK_SIZE];
11367
11668
  float weight[IQ3S_BLOCK_SIZE];
11368
11669
  float xval[IQ3S_BLOCK_SIZE];
@@ -11373,7 +11674,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
11373
11674
  bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
11374
11675
  uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
11375
11676
  char * qrow = (char *)dst;
11376
- for (int row = 0; row < nrow; ++row) {
11677
+ for (int64_t row = 0; row < nrow; ++row) {
11377
11678
  quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
11378
11679
  scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
11379
11680
  src += n_per_row;
@@ -11382,13 +11683,13 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
11382
11683
  return nrow * nblock * sizeof(block_iq3_s);
11383
11684
  }
11384
11685
 
11385
- void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
11686
+ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
11386
11687
  assert(k % QK_K == 0);
11387
11688
  block_iq3_s * restrict y = vy;
11388
11689
  quantize_row_iq3_s_reference(x, y, k);
11389
11690
  }
11390
11691
 
11391
- void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
11692
+ void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
11392
11693
  assert(k % QK_K == 0);
11393
11694
  quantize_iq3_s(x, y, 1, k, NULL);
11394
11695
  }
@@ -11520,7 +11821,16 @@ static int iq1_sort_helper(const void * left, const void * right) {
11520
11821
  }
11521
11822
 
11522
11823
  #define IQ1S_BLOCK_SIZE 32
11523
- static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
11824
+ #define IQ1M_BLOCK_SIZE 16
11825
+ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
11826
+ float * scales,
11827
+ float * weight,
11828
+ float * sumx,
11829
+ float * sumw,
11830
+ float * pairs,
11831
+ int8_t * L,
11832
+ uint16_t * index,
11833
+ int8_t * shifts) {
11524
11834
 
11525
11835
  const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
11526
11836
 
@@ -11534,22 +11844,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11534
11844
  GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
11535
11845
  GGML_ASSERT(n%QK_K == 0);
11536
11846
 
11537
- const int nbl = n/QK_K;
11538
-
11539
11847
  block_iq1_s * y = vy;
11540
11848
 
11849
+ const int64_t nbl = n/QK_K;
11850
+
11851
+ const int block_size = IQ1S_BLOCK_SIZE;
11852
+
11541
11853
  const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
11542
11854
  const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
11543
11855
 
11544
- float scales[QK_K/IQ1S_BLOCK_SIZE];
11545
- float weight[IQ1S_BLOCK_SIZE];
11546
- int8_t L[IQ1S_BLOCK_SIZE];
11547
- float sumx[IQ1S_BLOCK_SIZE+1];
11548
- float sumw[IQ1S_BLOCK_SIZE+1];
11549
- float pairs[2*IQ1S_BLOCK_SIZE];
11856
+
11550
11857
  int * idx = (int *)(pairs + 1);
11551
- uint16_t index[IQ1S_BLOCK_SIZE/8];
11552
- int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
11553
11858
 
11554
11859
  for (int ibl = 0; ibl < nbl; ++ibl) {
11555
11860
 
@@ -11564,15 +11869,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11564
11869
  for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
11565
11870
  float sigma2 = 2*sumx2/QK_K;
11566
11871
 
11567
- for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
11568
- const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
11569
- const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
11570
- for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11872
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
11873
+ const float * xb = xbl + block_size*ib;
11874
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
11875
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11571
11876
  float max = fabsf(xb[0]);
11572
- for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
11877
+ for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
11573
11878
  if (!max) {
11574
11879
  scales[ib] = 0;
11575
- memset(L, 1, IQ1S_BLOCK_SIZE);
11880
+ memset(L, 1, block_size);
11576
11881
  continue;
11577
11882
  }
11578
11883
  // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
@@ -11581,14 +11886,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11581
11886
  // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
11582
11887
  // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
11583
11888
  // and score for each possible split.
11584
- for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
11889
+ for (int j = 0; j < block_size; ++j) {
11585
11890
  pairs[2*j] = xb[j];
11586
11891
  idx[2*j] = j;
11587
11892
  }
11588
- qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
11893
+ qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
11589
11894
  {
11590
11895
  sumx[0] = sumw[0] = 0;
11591
- for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
11896
+ for (int j = 0; j < block_size; ++j) {
11592
11897
  int i = idx[2*j];
11593
11898
  sumx[j+1] = sumx[j] + weight[i]*xb[i];
11594
11899
  sumw[j+1] = sumw[j] + weight[i];
@@ -11596,16 +11901,16 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11596
11901
  }
11597
11902
  float best_score = 0, scale = max;
11598
11903
  int besti1 = -1, besti2 = -1, best_shift = 0;
11599
- for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
11600
- for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
11601
- float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
11602
- float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
11904
+ for (int i1 = 0; i1 <= block_size; ++i1) {
11905
+ for (int i2 = i1; i2 <= block_size; ++i2) {
11906
+ float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
11907
+ float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
11603
11908
  if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
11604
11909
  scale = sumqx/sumq2; best_score = scale*sumqx;
11605
11910
  besti1 = i1; besti2 = i2; best_shift = 1;
11606
11911
  }
11607
- sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
11608
- sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
11912
+ sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
11913
+ sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
11609
11914
  if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
11610
11915
  scale = sumqx/sumq2; best_score = scale*sumqx;
11611
11916
  besti1 = i1; besti2 = i2; best_shift = -1;
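The scale and score updates in this loop are the closed form of the weighted least-squares fit described by the comment block above. For a fixed split, i.e. a fixed choice of quant value q_i for every entry, a short derivation (notation mine; sumqx and sumq2 are the code's accumulators):

\[
\min_d \sum_i w_i (x_i - d\,q_i)^2 \;\Rightarrow\; d^\* = \frac{\sum_i w_i q_i x_i}{\sum_i w_i q_i^2} = \frac{\mathrm{sumqx}}{\mathrm{sumq2}},
\qquad
\sum_i w_i (x_i - d^\* q_i)^2 = \sum_i w_i x_i^2 - \frac{\mathrm{sumqx}^2}{\mathrm{sumq2}}.
\]

Maximizing sumqx^2/sumq2 therefore minimizes the weighted error, which is exactly the `sumqx*sumqx > best_score*sumq2` comparison (best_score holds d^* times sumqx for the current best split), and the prefix sums S_i and W_i make sumqx and sumq2 available in O(1) for every candidate split (i1, i2).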
@@ -11615,14 +11920,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11615
11920
  GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
11616
11921
  for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
11617
11922
  for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
11618
- for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
11923
+ for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
11619
11924
  if (scale < 0) {
11620
- for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
11925
+ for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
11621
11926
  scale = -scale; best_shift = -best_shift;
11622
11927
  }
11623
11928
  bool all_on_grid = true;
11624
11929
  const float * xx = best_shift == 1 ? x_p : x_m;
11625
- for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
11930
+ for (int k = 0; k < block_size/8; ++k) {
11626
11931
  uint16_t u = 0;
11627
11932
  for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
11628
11933
  int grid_index = kmap_q2xs[u];
@@ -11636,7 +11941,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11636
11941
  }
11637
11942
  if (!all_on_grid) {
11638
11943
  float sumqx = 0, sumq2 = 0;
11639
- for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
11944
+ for (int k = 0; k < block_size/8; ++k) {
11640
11945
  const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
11641
11946
  for (int j = 0; j < 8; ++j) {
11642
11947
  float w = weight[8*k + j];
@@ -11648,8 +11953,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11648
11953
  if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
11649
11954
  }
11650
11955
  uint16_t h = 0;
11651
- for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
11652
- y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255;
11956
+ for (int k = 0; k < block_size/8; ++k) {
11957
+ y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
11653
11958
  h |= (index[k] >> 8) << 3*k;
11654
11959
  }
11655
11960
  y[ibl].qh[ib] = h;
@@ -11660,14 +11965,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11660
11965
  }
11661
11966
 
11662
11967
  if (!max_scale) {
11663
- memset(y[ibl].qs, 0, QK_K/8);
11664
11968
  continue;
11665
11969
  }
11666
11970
 
11667
11971
  float d = max_scale/15;
11668
- y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.085f is another fudge factor. Don't ask me why it is needed.
11972
+ y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
11669
11973
  float id = 1/d;
11670
- for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
11974
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
11671
11975
  int l = nearest_int(0.5f*(id*scales[ib]-1));
11672
11976
  l = MAX(0, MIN(7, l));
11673
11977
  if (shifts[ib] == -1) l |= 8;
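The 3-bit code written here is an odd-multiplier encoding of the per-block scale: with d = max_scale/15 and id = 1/d, the stored value reconstructs scales[ib] as approximately d*(2l+1), so the reconstructed scale ranges over the odd multiples d, 3d, ..., 15d. Bit 3 (`l |= 8`) records that the block chose the shifted-minus grid x_m. In formula form (my notation, derived from the two lines above):

\[
l = \operatorname{round}\!\Big(\tfrac12\big(\tfrac{\mathrm{scales}[ib]}{d} - 1\big)\Big),\quad 0 \le l \le 7,
\qquad
\mathrm{scales}[ib] \approx d\,(2l+1).
\]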
@@ -11676,18 +11980,309 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11676
11980
  }
11677
11981
  }
11678
11982
 
11679
- size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11983
+ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11680
11984
  GGML_ASSERT(n_per_row%QK_K == 0);
11681
- int nblock = n_per_row/QK_K;
11985
+ float scales[QK_K/IQ1S_BLOCK_SIZE];
11986
+ float weight[IQ1S_BLOCK_SIZE];
11987
+ int8_t L[IQ1S_BLOCK_SIZE];
11988
+ float sumx[IQ1S_BLOCK_SIZE+1];
11989
+ float sumw[IQ1S_BLOCK_SIZE+1];
11990
+ float pairs[2*IQ1S_BLOCK_SIZE];
11991
+ uint16_t index[IQ1S_BLOCK_SIZE/8];
11992
+ int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
11993
+ int64_t nblock = n_per_row/QK_K;
11682
11994
  char * qrow = (char *)dst;
11683
- for (int row = 0; row < nrow; ++row) {
11684
- quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
11995
+ for (int64_t row = 0; row < nrow; ++row) {
11996
+ quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
11685
11997
  src += n_per_row;
11686
11998
  qrow += nblock*sizeof(block_iq1_s);
11687
11999
  }
11688
12000
  return nrow * nblock * sizeof(block_iq1_s);
11689
12001
  }
11690
12002
 
12003
+ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
12004
+ float * scales,
12005
+ float * weight,
12006
+ float * pairs,
12007
+ int8_t * L,
12008
+ uint16_t * index,
12009
+ int8_t * shifts) {
12010
+
12011
+ const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
12012
+
12013
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
12014
+ const int * kmap_q2xs = iq2_data[gindex].map;
12015
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
12016
+
12017
+ //GGML_ASSERT(quant_weights && "missing quantization weights");
12018
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
12019
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
12020
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
12021
+ GGML_ASSERT(n%QK_K == 0);
12022
+
12023
+ block_iq1_m * y = vy;
12024
+
12025
+ const int64_t nbl = n/QK_K;
12026
+
12027
+ const int block_size = IQ1M_BLOCK_SIZE;
12028
+
12029
+ const float x_p[3] = {-1 + IQ1M_DELTA, IQ1M_DELTA, 1 + IQ1M_DELTA};
12030
+ const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
12031
+ const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
12032
+
12033
+ int * idx = (int *)(pairs + 1);
12034
+
12035
+ float sumqx[4], sumq2[4];
12036
+
12037
+ iq1m_scale_t s;
12038
+ const float * xx;
12039
+
12040
+ for (int ibl = 0; ibl < nbl; ++ibl) {
12041
+
12042
+ #if QK_K == 64
12043
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
12044
+ #endif
12045
+ memset(y[ibl].qs, 0, QK_K/8);
12046
+ memset(y[ibl].qh, 0, QK_K/16);
12047
+ memset(y[ibl].scales, 0, QK_K/32);
12048
+
12049
+ float max_scale = 0;
12050
+
12051
+ const float * xbl = x + QK_K*ibl;
12052
+ float sumx2 = 0;
12053
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
12054
+ float sigma2 = 2*sumx2/QK_K;
12055
+
12056
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
12057
+ const float * xb = xbl + block_size*ib;
12058
+ if (quant_weights) {
12059
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
12060
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
12061
+ } else {
12062
+ for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
12063
+ }
12064
+ float max = fabsf(xb[0]);
12065
+ for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
12066
+ if (!max) {
12067
+ scales[ib] = 0;
12068
+ memset(L, 1, block_size);
12069
+ continue;
12070
+ }
12071
+ // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
12072
+ // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
12073
+ // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
12074
+ // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
12075
+ // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
12076
+ // and score for each possible split.
12077
+ for (int j = 0; j < block_size; ++j) {
12078
+ pairs[2*j] = xb[j];
12079
+ idx[2*j] = j;
12080
+ }
12081
+ qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
12082
+ float best_score = 0, scale = max;
12083
+ int besti1 = -1, besti2 = -1, best_k = -1;
12084
+ // 0: +, +
12085
+ // 1: +, -
12086
+ // 2: -, +
12087
+ // 3: -, -
12088
+ for (int i1 = 0; i1 <= block_size; ++i1) {
12089
+ for (int i2 = i1; i2 <= block_size; ++i2) {
12090
+ memset(sumqx, 0, 4*sizeof(float));
12091
+ memset(sumq2, 0, 4*sizeof(float));
12092
+ for (int j = 0; j < i1; ++j) {
12093
+ int i = idx[2*j];
12094
+ if (i < block_size/2) {
12095
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
12096
+ sumqx[1] += weight[i]*x_p[0]*xb[i];
12097
+ sumqx[2] += weight[i]*x_m[0]*xb[i];
12098
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
12099
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
12100
+ sumq2[1] += weight[i]*x_p[0]*x_p[0];
12101
+ sumq2[2] += weight[i]*x_m[0]*x_m[0];
12102
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
12103
+ } else {
12104
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
12105
+ sumqx[2] += weight[i]*x_p[0]*xb[i];
12106
+ sumqx[1] += weight[i]*x_m[0]*xb[i];
12107
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
12108
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
12109
+ sumq2[2] += weight[i]*x_p[0]*x_p[0];
12110
+ sumq2[1] += weight[i]*x_m[0]*x_m[0];
12111
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
12112
+ }
12113
+ }
12114
+ for (int j = i1; j < i2; ++j) {
12115
+ int i = idx[2*j];
12116
+ if (i < block_size/2) {
12117
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
12118
+ sumqx[1] += weight[i]*x_p[1]*xb[i];
12119
+ sumqx[2] += weight[i]*x_m[1]*xb[i];
12120
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
12121
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
12122
+ sumq2[1] += weight[i]*x_p[1]*x_p[1];
12123
+ sumq2[2] += weight[i]*x_m[1]*x_m[1];
12124
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
12125
+ } else {
12126
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
12127
+ sumqx[2] += weight[i]*x_p[1]*xb[i];
12128
+ sumqx[1] += weight[i]*x_m[1]*xb[i];
12129
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
12130
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
12131
+ sumq2[2] += weight[i]*x_p[1]*x_p[1];
12132
+ sumq2[1] += weight[i]*x_m[1]*x_m[1];
12133
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
12134
+ }
12135
+ }
12136
+ for (int j = i2; j < block_size; ++j) {
12137
+ int i = idx[2*j];
12138
+ if (i < block_size/2) {
12139
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
12140
+ sumqx[1] += weight[i]*x_p[2]*xb[i];
12141
+ sumqx[2] += weight[i]*x_m[2]*xb[i];
12142
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
12143
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
12144
+ sumq2[1] += weight[i]*x_p[2]*x_p[2];
12145
+ sumq2[2] += weight[i]*x_m[2]*x_m[2];
12146
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
12147
+ } else {
12148
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
12149
+ sumqx[2] += weight[i]*x_p[2]*xb[i];
12150
+ sumqx[1] += weight[i]*x_m[2]*xb[i];
12151
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
12152
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
12153
+ sumq2[2] += weight[i]*x_p[2]*x_p[2];
12154
+ sumq2[1] += weight[i]*x_m[2]*x_m[2];
12155
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
12156
+ }
12157
+ }
12158
+ for (int k = 0; k < 4; ++k) {
12159
+ if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
12160
+ scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
12161
+ besti1 = i1; besti2 = i2; best_k = k;
12162
+ }
12163
+ }
12164
+ }
12165
+ }
12166
+ GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
12167
+ for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
12168
+ for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
12169
+ for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
12170
+ if (scale < 0) {
12171
+ for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
12172
+ scale = -scale;
12173
+ best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
12174
+ }
12175
+ bool all_on_grid = true;
12176
+ for (int k = 0; k < block_size/8; ++k) {
12177
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
12178
+ else xx = best_k%2 == 0 ? x_p : x_m;
12179
+ uint16_t u = 0;
12180
+ for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
12181
+ int grid_index = kmap_q2xs[u];
12182
+ if (grid_index < 0) {
12183
+ all_on_grid = false;
12184
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12185
+ grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
12186
+ GGML_ASSERT(grid_index >= 0);
12187
+ }
12188
+ index[k] = grid_index;
12189
+ }
12190
+ if (!all_on_grid) {
12191
+ float sumqx_f = 0, sumq2_f = 0;
12192
+ for (int k = 0; k < block_size/8; ++k) {
12193
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
12194
+ else xx = best_k%2 == 0 ? x_p : x_m;
12195
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
12196
+ for (int j = 0; j < 8; ++j) {
12197
+ float w = weight[8*k + j];
12198
+ float q = xx[(pg[j] - 1)/2];
12199
+ sumqx_f += w*q*xb[8*k+j];
12200
+ sumq2_f += w*q*q;
12201
+ }
12202
+ }
12203
+ if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
12204
+ }
12205
+ y[ibl].qs[2*ib + 0] = index[0] & 255;
12206
+ y[ibl].qs[2*ib + 1] = index[1] & 255;
12207
+ y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
12208
+ GGML_ASSERT(scale >= 0);
12209
+ scales[ib] = scale;
12210
+ shifts[ib] = best_k;
12211
+ max_scale = MAX(max_scale, scale);
12212
+ }
12213
+
12214
+ if (!max_scale) {
12215
+ continue;
12216
+ }
12217
+
12218
+ uint16_t * sc = (uint16_t *)y[ibl].scales;
12219
+ #if QK_K == 64
12220
+ float d = max_scale/31;
12221
+ #else
12222
+ float d = max_scale/15;
12223
+ #endif
12224
+ float id = 1/d;
12225
+ float sumqx_f = 0, sumq2_f = 0;
12226
+ for (int ib = 0; ib < QK_K/block_size; ++ib) {
12227
+ int l = nearest_int(0.5f*(id*scales[ib+0]-1));
12228
+ #if QK_K == 64
12229
+ l = MAX(0, MIN(15, l));
12230
+ sc[ib/4] |= (l << 4*(ib%4));
12231
+ #else
12232
+ l = MAX(0, MIN(7, l));
12233
+ sc[ib/4] |= (l << 3*(ib%4));
12234
+ #endif
12235
+ y[ibl].qh[ib] |= masks[shifts[ib]];
12236
+ const float * xb = xbl + block_size*ib;
12237
+ if (quant_weights) {
12238
+ const float * qw = quant_weights + QK_K*ibl + block_size*ib;
12239
+ for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
12240
+ } else {
12241
+ for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
12242
+ }
12243
+ for (int k = 0; k < block_size/8; ++k) {
12244
+ if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
12245
+ else xx = shifts[ib]%2 == 0 ? x_p : x_m;
12246
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
12247
+ for (int j = 0; j < 8; ++j) {
12248
+ float w = weight[8*k + j];
12249
+ float q = xx[(pg[j] - 1)/2]*(2*l+1);
12250
+ sumqx_f += w*q*xb[8*k+j];
12251
+ sumq2_f += w*q*q;
12252
+ }
12253
+ }
12254
+ }
12255
+ if (sumq2_f > 0) d = sumqx_f/sumq2_f;
12256
+ s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
12257
+ #if QK_K == 64
12258
+ y[ibl].d = s.f16;
12259
+ #else
12260
+ sc[0] |= ((s.u16 & 0x000f) << 12);
12261
+ sc[1] |= ((s.u16 & 0x00f0) << 8);
12262
+ sc[2] |= ((s.u16 & 0x0f00) << 4);
12263
+ sc[3] |= ((s.u16 & 0xf000) << 0);
12264
+ #endif
12265
+ }
12266
+ }
12267
+
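In quantize_row_iq1_m_impl, best_k indexes the four sign-shift combinations listed in the `0: +, + ... 3: -, -` comment: the first sign is the delta table used by the low 8 values of the 16-value block (x_p when best_k < 2, x_m otherwise) and the second sign is the table used by the high 8 values (x_p when best_k is even). The choice is stored in y[ibl].qh[ib] through masks[] = {0x00, 0x80, 0x08, 0x88}. A purely illustrative helper (not a function in this file) that reads those two flag bits back:

    /* illustrative only: which half of an IQ1_M sub-block was quantized
       against the shifted-minus table x_m, recovered from qh[ib] */
    static inline void iq1m_shift_flags(uint8_t qh, bool * low_half_minus, bool * high_half_minus) {
        *low_half_minus  = (qh & 0x08) != 0;   /* set by masks[2]/masks[3], i.e. best_k >= 2 */
        *high_half_minus = (qh & 0x80) != 0;   /* set by masks[1]/masks[3], i.e. best_k odd  */
    }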
12268
+ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12269
+ GGML_ASSERT(n_per_row%QK_K == 0);
12270
+ float scales[QK_K/IQ1M_BLOCK_SIZE];
12271
+ float weight[IQ1M_BLOCK_SIZE];
12272
+ int8_t L[IQ1M_BLOCK_SIZE];
12273
+ float pairs[2*IQ1M_BLOCK_SIZE];
12274
+ uint16_t index[IQ1M_BLOCK_SIZE/8];
12275
+ int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
12276
+ int64_t nblock = n_per_row/QK_K;
12277
+ char * qrow = (char *)dst;
12278
+ for (int64_t row = 0; row < nrow; ++row) {
12279
+ quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
12280
+ src += n_per_row;
12281
+ qrow += nblock*sizeof(block_iq1_m);
12282
+ }
12283
+ return nrow * nblock * sizeof(block_iq1_m);
12284
+ }
12285
+
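The nibble packing at the end of quantize_row_iq1_m_impl stores the fp16 super-block scale in the top four bits of the four 16-bit words of y[ibl].scales (the QK_K == 256 path): nibble i of s.u16 lands in bits 12..15 of sc[i]. Sketch of the inverse, shown only to make the layout concrete (derived from the packing above, not copied from a decoder):

    /* illustrative reassembly of the packed IQ1_M super-block scale (QK_K == 256) */
    const uint16_t * sc = (const uint16_t *) y[ibl].scales;
    iq1m_scale_t s;
    s.u16 = (uint16_t)((sc[0] >> 12)
                     | (((sc[1] >> 12) & 0xf) << 4)
                     | (((sc[2] >> 12) & 0xf) << 8)
                     |   (sc[3] & 0xf000));
    const float d = GGML_FP16_TO_FP32(s.f16);   /* includes the 1.1125f fudge factor applied at quantization */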
11691
12286
  // ============================ 4-bit non-linear quants
11692
12287
 
11693
12288
  static inline int best_index_int8(int n, const int8_t * val, float x) {
@@ -11812,16 +12407,16 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
11812
12407
  }
11813
12408
  }
11814
12409
 
11815
- size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12410
+ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11816
12411
  GGML_ASSERT(n_per_row%QK4_NL == 0);
11817
- int nblock = n_per_row/QK4_NL;
12412
+ int64_t nblock = n_per_row/QK4_NL;
11818
12413
  char * qrow = (char *)dst;
11819
12414
  uint8_t L[QK4_NL];
11820
12415
  float weight[QK4_NL];
11821
12416
  uint16_t unused_h;
11822
12417
  uint8_t * unused_l = NULL;
11823
12418
  float scale;
11824
- for (int row = 0; row < nrow; ++row) {
12419
+ for (int64_t row = 0; row < nrow; ++row) {
11825
12420
  block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
11826
12421
  for (int ibl = 0; ibl < nblock; ++ibl) {
11827
12422
  const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
@@ -11834,9 +12429,9 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
11834
12429
  return nrow * nblock * sizeof(block_iq4_nl);
11835
12430
  }
11836
12431
 
11837
- void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12432
+ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) {
11838
12433
  GGML_ASSERT(k%QK4_NL == 0);
11839
- int nblock = k/QK4_NL;
12434
+ int64_t nblock = k/QK4_NL;
11840
12435
  uint8_t L[QK4_NL];
11841
12436
  float weight[QK4_NL];
11842
12437
  uint16_t unused_h;
@@ -11849,22 +12444,22 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
11849
12444
  }
11850
12445
  }
11851
12446
 
11852
- void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
12447
+ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
11853
12448
  assert(k % QK4_NL == 0);
11854
12449
  quantize_row_iq4_nl(x, y, k);
11855
12450
  }
11856
12451
 
11857
- size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12452
+ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11858
12453
  #if QK_K == 64
11859
12454
  return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
11860
12455
  #else
11861
12456
  GGML_ASSERT(n_per_row%QK_K == 0);
11862
- int nblock = n_per_row/QK_K;
12457
+ int64_t nblock = n_per_row/QK_K;
11863
12458
  char * qrow = (char *)dst;
11864
12459
  uint8_t L[QK_K];
11865
12460
  float weight[32];
11866
12461
  float scales[QK_K/32];
11867
- for (int row = 0; row < nrow; ++row) {
12462
+ for (int64_t row = 0; row < nrow; ++row) {
11868
12463
  block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
11869
12464
  for (int ibl = 0; ibl < nblock; ++ibl) {
11870
12465
  const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
@@ -11878,20 +12473,20 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
11878
12473
  #endif
11879
12474
  }
11880
12475
 
11881
- void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
12476
+ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
11882
12477
  assert(k % QK_K == 0);
11883
12478
  block_iq4_xs * restrict y = vy;
11884
12479
  quantize_row_iq4_xs_reference(x, y, k);
11885
12480
  }
11886
12481
 
11887
- void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
12482
+ void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
11888
12483
  assert(k % QK_K == 0);
11889
12484
  quantize_iq4_xs(x, y, 1, k, NULL);
11890
12485
  }
11891
12486
 
11892
12487
  // =============================== 2.5625 bpw
11893
12488
 
11894
- static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12489
+ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
11895
12490
 
11896
12491
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
11897
12492
 
@@ -11906,7 +12501,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
11906
12501
 
11907
12502
  const int kMaxQ = 3;
11908
12503
 
11909
- const int nbl = n/QK_K;
12504
+ const int64_t nbl = n/QK_K;
11910
12505
 
11911
12506
  block_iq2_s * y = vy;
11912
12507
 
@@ -12059,11 +12654,11 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
12059
12654
  }
12060
12655
  }
12061
12656
 
12062
- size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12657
+ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12063
12658
  GGML_ASSERT(n_per_row%QK_K == 0);
12064
- int nblock = n_per_row/QK_K;
12659
+ int64_t nblock = n_per_row/QK_K;
12065
12660
  char * qrow = (char *)dst;
12066
- for (int row = 0; row < nrow; ++row) {
12661
+ for (int64_t row = 0; row < nrow; ++row) {
12067
12662
  quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
12068
12663
  src += n_per_row;
12069
12664
  qrow += nblock*sizeof(block_iq2_s);
@@ -12071,12 +12666,12 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow,
12071
12666
  return nrow * nblock * sizeof(block_iq2_s);
12072
12667
  }
12073
12668
 
12074
- void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
12669
+ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
12075
12670
  assert(k % QK_K == 0);
12076
12671
  quantize_iq2_s(x, y, 1, k, NULL);
12077
12672
  }
12078
12673
 
12079
- void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
12674
+ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
12080
12675
  assert(k % QK_K == 0);
12081
12676
  block_iq2_s * restrict y = vy;
12082
12677
  quantize_row_iq2_s_reference(x, y, k);