llama_cpp 0.14.4 → 0.14.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
132
132
  }
133
133
 
134
134
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
135
- #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
135
+ #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
136
136
  const __m256i zero = _mm256_setzero_si256();
137
137
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
138
138
  return _mm256_cvtepi32_ps(summed_pairs);
@@ -544,7 +544,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
544
544
  #endif
545
545
 
546
546
  // reference implementation for deterministic creation of model files
547
- void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
547
+ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
548
548
  static const int qk = QK4_0;
549
549
 
550
550
  assert(k % qk == 0);
@@ -581,12 +581,12 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
581
581
  }
582
582
  }
583
583
 
584
- void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
584
+ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
585
585
  quantize_row_q4_0_reference(x, y, k);
586
586
  }
587
587
 
588
588
 
589
- void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
589
+ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
590
590
  const int qk = QK4_1;
591
591
 
592
592
  assert(k % qk == 0);
@@ -623,11 +623,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
623
623
  }
624
624
  }
625
625
 
626
- void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
626
+ void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
627
627
  quantize_row_q4_1_reference(x, y, k);
628
628
  }
629
629
 
630
- void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
630
+ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
631
631
  static const int qk = QK5_0;
632
632
 
633
633
  assert(k % qk == 0);
@@ -671,11 +671,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
671
671
  }
672
672
  }
673
673
 
674
- void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
674
+ void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
675
675
  quantize_row_q5_0_reference(x, y, k);
676
676
  }
677
677
 
678
- void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
678
+ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
679
679
  const int qk = QK5_1;
680
680
 
681
681
  assert(k % qk == 0);
@@ -719,12 +719,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
719
719
  }
720
720
  }
721
721
 
722
- void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
722
+ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
723
723
  quantize_row_q5_1_reference(x, y, k);
724
724
  }
725
725
 
726
726
  // reference implementation for deterministic creation of model files
727
- void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
727
+ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
728
728
  assert(k % QK8_0 == 0);
729
729
  const int nb = k / QK8_0;
730
730
 
@@ -749,7 +749,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict
749
749
  }
750
750
  }
751
751
 
752
- void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
752
+ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
753
753
  assert(QK8_0 == 32);
754
754
  assert(k % QK8_0 == 0);
755
755
  const int nb = k / QK8_0;
@@ -938,7 +938,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
938
938
  }
939
939
 
940
940
  // reference implementation for deterministic creation of model files
941
- void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
941
+ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
942
942
  assert(QK8_1 == 32);
943
943
  assert(k % QK8_1 == 0);
944
944
  const int nb = k / QK8_1;
@@ -973,7 +973,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
973
973
  }
974
974
  }
975
975
 
976
- void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
976
+ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
977
977
  assert(k % QK8_1 == 0);
978
978
  const int nb = k / QK8_1;
979
979
 
@@ -1192,7 +1192,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1192
1192
  #endif
1193
1193
  }
1194
1194
 
1195
- void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
1195
+ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
1196
1196
  static const int qk = QK4_0;
1197
1197
 
1198
1198
  assert(k % qk == 0);
@@ -1212,7 +1212,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
1212
1212
  }
1213
1213
  }
1214
1214
 
1215
- void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
1215
+ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
1216
1216
  static const int qk = QK4_1;
1217
1217
 
1218
1218
  assert(k % qk == 0);
@@ -1233,7 +1233,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
1233
1233
  }
1234
1234
  }
1235
1235
 
1236
- void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
1236
+ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
1237
1237
  static const int qk = QK5_0;
1238
1238
 
1239
1239
  assert(k % qk == 0);
@@ -1259,7 +1259,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int
1259
1259
  }
1260
1260
  }
1261
1261
 
1262
- void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
1262
+ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
1263
1263
  static const int qk = QK5_1;
1264
1264
 
1265
1265
  assert(k % qk == 0);
@@ -1286,7 +1286,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int
1286
1286
  }
1287
1287
  }
1288
1288
 
1289
- void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) {
1289
+ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
1290
1290
  static const int qk = QK8_0;
1291
1291
 
1292
1292
  assert(k % qk == 0);
@@ -1581,7 +1581,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
1581
1581
 
1582
1582
  //========================- 2-bit (de)-quantization
1583
1583
 
1584
- void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
1584
+ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
1585
1585
  assert(k % QK_K == 0);
1586
1586
  const int nb = k / QK_K;
1587
1587
 
@@ -1658,7 +1658,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
1658
1658
  }
1659
1659
  }
1660
1660
 
1661
- void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
1661
+ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
1662
1662
  assert(k % QK_K == 0);
1663
1663
  const int nb = k / QK_K;
1664
1664
 
@@ -1704,7 +1704,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
1704
1704
  }
1705
1705
  }
1706
1706
 
1707
- void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
1707
+ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
1708
1708
  quantize_row_q2_K_reference(x, vy, k);
1709
1709
  }
1710
1710
 
@@ -1960,14 +1960,14 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1960
1960
  }
1961
1961
  }
1962
1962
 
1963
- size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
1963
+ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1964
1964
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1965
1965
  if (!quant_weights) {
1966
- quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
1966
+ quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
1967
1967
  }
1968
1968
  else {
1969
1969
  char * qrow = (char *)dst;
1970
- for (int row = 0; row < nrow; ++row) {
1970
+ for (int64_t row = 0; row < nrow; ++row) {
1971
1971
  quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
1972
1972
  src += n_per_row;
1973
1973
  qrow += row_size;
@@ -1978,7 +1978,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow,
1978
1978
 
1979
1979
  //========================= 3-bit (de)-quantization
1980
1980
 
1981
- void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
1981
+ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
1982
1982
  assert(k % QK_K == 0);
1983
1983
  const int nb = k / QK_K;
1984
1984
 
@@ -2092,7 +2092,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
2092
2092
  }
2093
2093
 
2094
2094
  #if QK_K == 256
2095
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
2095
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
2096
2096
  assert(k % QK_K == 0);
2097
2097
  const int nb = k / QK_K;
2098
2098
 
@@ -2142,7 +2142,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
2142
2142
  }
2143
2143
  }
2144
2144
  #else
2145
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
2145
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
2146
2146
  assert(k % QK_K == 0);
2147
2147
  assert(QK_K == 64);
2148
2148
  const int nb = k / QK_K;
@@ -2175,11 +2175,11 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
2175
2175
  }
2176
2176
  #endif
2177
2177
 
2178
- void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
2178
+ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
2179
2179
  quantize_row_q3_K_reference(x, vy, k);
2180
2180
  }
2181
2181
 
2182
- static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2182
+ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
2183
2183
  #if QK_K != 256
2184
2184
  (void)quant_weights;
2185
2185
  quantize_row_q3_K_reference(x, y, n_per_row);
@@ -2268,14 +2268,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
2268
2268
  #endif
2269
2269
  }
2270
2270
 
2271
- size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2271
+ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2272
2272
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2273
2273
  if (!quant_weights) {
2274
- quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
2274
+ quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
2275
2275
  }
2276
2276
  else {
2277
2277
  char * qrow = (char *)dst;
2278
- for (int row = 0; row < nrow; ++row) {
2278
+ for (int64_t row = 0; row < nrow; ++row) {
2279
2279
  quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
2280
2280
  src += n_per_row;
2281
2281
  qrow += row_size;
@@ -2286,7 +2286,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow,
2286
2286
 
2287
2287
  // ====================== 4-bit (de)-quantization
2288
2288
 
2289
- void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
2289
+ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
2290
2290
  assert(k % QK_K == 0);
2291
2291
  const int nb = k / QK_K;
2292
2292
 
@@ -2393,7 +2393,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
2393
2393
  }
2394
2394
  }
2395
2395
 
2396
- void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
2396
+ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
2397
2397
  assert(k % QK_K == 0);
2398
2398
  const int nb = k / QK_K;
2399
2399
 
@@ -2432,19 +2432,19 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
2432
2432
  }
2433
2433
  }
2434
2434
 
2435
- void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
2435
+ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
2436
2436
  assert(k % QK_K == 0);
2437
2437
  block_q4_K * restrict y = vy;
2438
2438
  quantize_row_q4_K_reference(x, y, k);
2439
2439
  }
2440
2440
 
2441
- static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2441
+ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
2442
2442
  #if QK_K != 256
2443
2443
  (void)quant_weights;
2444
2444
  quantize_row_q4_K_reference(x, y, n_per_row);
2445
2445
  #else
2446
2446
  assert(n_per_row % QK_K == 0);
2447
- const int nb = n_per_row / QK_K;
2447
+ const int64_t nb = n_per_row / QK_K;
2448
2448
 
2449
2449
  uint8_t L[QK_K];
2450
2450
  uint8_t Laux[32];
@@ -2516,14 +2516,14 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2516
2516
  #endif
2517
2517
  }
2518
2518
 
2519
- size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2519
+ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2520
2520
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2521
2521
  if (!quant_weights) {
2522
- quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
2522
+ quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
2523
2523
  }
2524
2524
  else {
2525
2525
  char * qrow = (char *)dst;
2526
- for (int row = 0; row < nrow; ++row) {
2526
+ for (int64_t row = 0; row < nrow; ++row) {
2527
2527
  quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
2528
2528
  src += n_per_row;
2529
2529
  qrow += row_size;
@@ -2534,9 +2534,9 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow,
2534
2534
 
2535
2535
  // ====================== 5-bit (de)-quantization
2536
2536
 
2537
- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
2537
+ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
2538
2538
  assert(k % QK_K == 0);
2539
- const int nb = k / QK_K;
2539
+ const int64_t nb = k / QK_K;
2540
2540
 
2541
2541
  #if QK_K == 256
2542
2542
  uint8_t L[QK_K];
@@ -2676,9 +2676,9 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
2676
2676
  }
2677
2677
  }
2678
2678
 
2679
- void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
2679
+ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
2680
2680
  assert(k % QK_K == 0);
2681
- const int nb = k / QK_K;
2681
+ const int64_t nb = k / QK_K;
2682
2682
 
2683
2683
  for (int i = 0; i < nb; i++) {
2684
2684
 
@@ -2721,19 +2721,19 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
2721
2721
  }
2722
2722
  }
2723
2723
 
2724
- void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
2724
+ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
2725
2725
  assert(k % QK_K == 0);
2726
2726
  block_q5_K * restrict y = vy;
2727
2727
  quantize_row_q5_K_reference(x, y, k);
2728
2728
  }
2729
2729
 
2730
- static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2730
+ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
2731
2731
  #if QK_K != 256
2732
2732
  (void)quant_weights;
2733
2733
  quantize_row_q5_K_reference(x, y, n_per_row);
2734
2734
  #else
2735
2735
  assert(n_per_row % QK_K == 0);
2736
- const int nb = n_per_row / QK_K;
2736
+ const int64_t nb = n_per_row / QK_K;
2737
2737
 
2738
2738
  uint8_t L[QK_K];
2739
2739
  uint8_t Laux[32];
@@ -2825,14 +2825,14 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2825
2825
  #endif
2826
2826
  }
2827
2827
 
2828
- size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2828
+ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2829
2829
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2830
2830
  if (!quant_weights) {
2831
- quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
2831
+ quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
2832
2832
  }
2833
2833
  else {
2834
2834
  char * qrow = (char *)dst;
2835
- for (int row = 0; row < nrow; ++row) {
2835
+ for (int64_t row = 0; row < nrow; ++row) {
2836
2836
  quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
2837
2837
  src += n_per_row;
2838
2838
  qrow += row_size;
@@ -2843,9 +2843,9 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow,
2843
2843
 
2844
2844
  // ====================== 6-bit (de)-quantization
2845
2845
 
2846
- void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
2846
+ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
2847
2847
  assert(k % QK_K == 0);
2848
- const int nb = k / QK_K;
2848
+ const int64_t nb = k / QK_K;
2849
2849
 
2850
2850
  int8_t L[QK_K];
2851
2851
  float scales[QK_K/16];
@@ -2925,9 +2925,9 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2925
2925
  }
2926
2926
  }
2927
2927
 
2928
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
2928
+ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
2929
2929
  assert(k % QK_K == 0);
2930
- const int nb = k / QK_K;
2930
+ const int64_t nb = k / QK_K;
2931
2931
 
2932
2932
  for (int i = 0; i < nb; i++) {
2933
2933
 
@@ -2972,19 +2972,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
2972
2972
  }
2973
2973
  }
2974
2974
 
2975
- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
2975
+ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
2976
2976
  assert(k % QK_K == 0);
2977
2977
  block_q6_K * restrict y = vy;
2978
2978
  quantize_row_q6_K_reference(x, y, k);
2979
2979
  }
2980
2980
 
2981
- static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
2981
+ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
2982
2982
  #if QK_K != 256
2983
2983
  (void)quant_weights;
2984
2984
  quantize_row_q6_K_reference(x, y, n_per_row);
2985
2985
  #else
2986
2986
  assert(n_per_row % QK_K == 0);
2987
- const int nb = n_per_row / QK_K;
2987
+ const int64_t nb = n_per_row / QK_K;
2988
2988
 
2989
2989
  int8_t L[QK_K];
2990
2990
  float scales[QK_K/16];
@@ -3067,14 +3067,14 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
3067
3067
  #endif
3068
3068
  }
3069
3069
 
3070
- size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3070
+ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3071
3071
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3072
3072
  if (!quant_weights) {
3073
- quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
3073
+ quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
3074
3074
  }
3075
3075
  else {
3076
3076
  char * qrow = (char *)dst;
3077
- for (int row = 0; row < nrow; ++row) {
3077
+ for (int64_t row = 0; row < nrow; ++row) {
3078
3078
  quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
3079
3079
  src += n_per_row;
3080
3080
  qrow += row_size;
@@ -3083,7 +3083,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow,
3083
3083
  return nrow * row_size;
3084
3084
  }
3085
3085
 
3086
- static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
3086
+ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
3087
3087
  static_assert(QK4_0 == 32, "QK4_0 must be 32");
3088
3088
 
3089
3089
  if (!quant_weights) {
@@ -3098,7 +3098,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3098
3098
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3099
3099
  float sigma2 = sum_x2/n_per_row;
3100
3100
 
3101
- const int nb = n_per_row/QK4_0;
3101
+ const int64_t nb = n_per_row/QK4_0;
3102
3102
  for (int ib = 0; ib < nb; ++ib) {
3103
3103
  const float * xb = x + QK4_0 * ib;
3104
3104
  const float * qw = quant_weights + QK4_0 * ib;
@@ -3111,14 +3111,14 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3111
3111
  }
3112
3112
  }
3113
3113
 
3114
- size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3114
+ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3115
3115
  if (!quant_weights) {
3116
- quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
3116
+ quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
3117
3117
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3118
3118
  }
3119
3119
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3120
3120
  char * qrow = (char *)dst;
3121
- for (int row = 0; row < nrow; ++row) {
3121
+ for (int64_t row = 0; row < nrow; ++row) {
3122
3122
  quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
3123
3123
  src += n_per_row;
3124
3124
  qrow += row_size;
@@ -3126,7 +3126,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow,
3126
3126
  return nrow * row_size;
3127
3127
  }
3128
3128
 
3129
- static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
3129
+ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
3130
3130
  static_assert(QK4_1 == 32, "QK4_1 must be 32");
3131
3131
 
3132
3132
  if (!quant_weights) {
@@ -3141,7 +3141,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3141
3141
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3142
3142
  float sigma2 = sum_x2/n_per_row;
3143
3143
 
3144
- const int nb = n_per_row/QK4_1;
3144
+ const int64_t nb = n_per_row/QK4_1;
3145
3145
  for (int ib = 0; ib < nb; ++ib) {
3146
3146
  const float * xb = x + QK4_1 * ib;
3147
3147
  const float * qw = quant_weights + QK4_1 * ib;
@@ -3156,14 +3156,14 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3156
3156
  }
3157
3157
  }
3158
3158
 
3159
- size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3159
+ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3160
3160
  if (!quant_weights) {
3161
- quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
3161
+ quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
3162
3162
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3163
3163
  }
3164
3164
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3165
3165
  char * qrow = (char *)dst;
3166
- for (int row = 0; row < nrow; ++row) {
3166
+ for (int64_t row = 0; row < nrow; ++row) {
3167
3167
  quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
3168
3168
  src += n_per_row;
3169
3169
  qrow += row_size;
@@ -3171,7 +3171,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow,
3171
3171
  return nrow * row_size;
3172
3172
  }
3173
3173
 
3174
- static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
3174
+ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
3175
3175
  static_assert(QK5_0 == 32, "QK5_0 must be 32");
3176
3176
 
3177
3177
  if (!quant_weights) {
@@ -3186,7 +3186,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3186
3186
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3187
3187
  float sigma2 = sum_x2/n_per_row;
3188
3188
 
3189
- const int nb = n_per_row/QK5_0;
3189
+ const int64_t nb = n_per_row/QK5_0;
3190
3190
  for (int ib = 0; ib < nb; ++ib) {
3191
3191
  const float * xb = x + QK5_0 * ib;
3192
3192
  const float * qw = quant_weights + QK5_0 * ib;
@@ -3210,14 +3210,14 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3210
3210
  }
3211
3211
  }
3212
3212
 
3213
- size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3213
+ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3214
3214
  if (!quant_weights) {
3215
- quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
3215
+ quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
3216
3216
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3217
3217
  }
3218
3218
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3219
3219
  char * qrow = (char *)dst;
3220
- for (int row = 0; row < nrow; ++row) {
3220
+ for (int64_t row = 0; row < nrow; ++row) {
3221
3221
  quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
3222
3222
  src += n_per_row;
3223
3223
  qrow += row_size;
@@ -3225,7 +3225,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow,
3225
3225
  return nrow * row_size;
3226
3226
  }
3227
3227
 
3228
- static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
3228
+ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
3229
3229
  static_assert(QK5_1 == 32, "QK5_1 must be 32");
3230
3230
 
3231
3231
  if (!quant_weights) {
@@ -3240,7 +3240,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3240
3240
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3241
3241
  float sigma2 = sum_x2/n_per_row;
3242
3242
 
3243
- const int nb = n_per_row/QK5_1;
3243
+ const int64_t nb = n_per_row/QK5_1;
3244
3244
  for (int ib = 0; ib < nb; ++ib) {
3245
3245
  const float * xb = x + QK5_1 * ib;
3246
3246
  const float * qw = quant_weights + QK5_1 * ib;
@@ -3263,14 +3263,14 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3263
3263
  }
3264
3264
  }
3265
3265
 
3266
- size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3266
+ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3267
3267
  if (!quant_weights) {
3268
- quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
3268
+ quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
3269
3269
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3270
3270
  }
3271
3271
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3272
3272
  char * qrow = (char *)dst;
3273
- for (int row = 0; row < nrow; ++row) {
3273
+ for (int64_t row = 0; row < nrow; ++row) {
3274
3274
  quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
3275
3275
  src += n_per_row;
3276
3276
  qrow += row_size;
@@ -3278,18 +3278,18 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow,
3278
3278
  return nrow * row_size;
3279
3279
  }
3280
3280
 
3281
- size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3281
+ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3282
3282
  (void)quant_weights; // not used
3283
3283
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
3284
- quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
3284
+ quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
3285
3285
  return nrow * row_size;
3286
3286
  }
3287
3287
 
3288
3288
  // ====================== "True" 2-bit (de)-quantization
3289
3289
 
3290
- void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
3290
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
3291
3291
  assert(k % QK_K == 0);
3292
- const int nb = k / QK_K;
3292
+ const int64_t nb = k / QK_K;
3293
3293
 
3294
3294
  uint32_t aux32[2];
3295
3295
  const uint8_t * aux8 = (const uint8_t *)aux32;
@@ -3315,9 +3315,9 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
3315
3315
 
3316
3316
  // ====================== 2.3125 bpw (de)-quantization
3317
3317
 
3318
- void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
3318
+ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
3319
3319
  assert(k % QK_K == 0);
3320
- const int nb = k / QK_K;
3320
+ const int64_t nb = k / QK_K;
3321
3321
 
3322
3322
  float db[2];
3323
3323
 
@@ -3342,9 +3342,9 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
3342
3342
 
3343
3343
  // ====================== 2.5625 bpw (de)-quantization
3344
3344
 
3345
- void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
3345
+ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
3346
3346
  assert(k % QK_K == 0);
3347
- const int nb = k / QK_K;
3347
+ const int64_t nb = k / QK_K;
3348
3348
 
3349
3349
  float db[2];
3350
3350
 
@@ -3374,9 +3374,9 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
3374
3374
 
3375
3375
  // ====================== 3.0625 bpw (de)-quantization
3376
3376
 
3377
- void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
3377
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
3378
3378
  assert(k % QK_K == 0);
3379
- const int nb = k / QK_K;
3379
+ const int64_t nb = k / QK_K;
3380
3380
 
3381
3381
  uint32_t aux32;
3382
3382
 
@@ -3406,9 +3406,9 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
3406
3406
 
3407
3407
  // ====================== 3.3125 bpw (de)-quantization
3408
3408
 
3409
- void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
3409
+ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
3410
3410
  assert(k % QK_K == 0);
3411
- const int nb = k / QK_K;
3411
+ const int64_t nb = k / QK_K;
3412
3412
 
3413
3413
  for (int i = 0; i < nb; i++) {
3414
3414
 
@@ -3449,9 +3449,9 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
3449
3449
 
3450
3450
  // ====================== 1.5625 bpw (de)-quantization
3451
3451
 
3452
- void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
3452
+ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
3453
3453
  assert(k % QK_K == 0);
3454
- const int nb = k / QK_K;
3454
+ const int64_t nb = k / QK_K;
3455
3455
 
3456
3456
  for (int i = 0; i < nb; i++) {
3457
3457
 
@@ -3474,9 +3474,9 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
3474
3474
  }
3475
3475
  }
3476
3476
 
3477
- void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
3477
+ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
3478
3478
  assert(k % QK_K == 0);
3479
- const int nb = k / QK_K;
3479
+ const int64_t nb = k / QK_K;
3480
3480
 
3481
3481
  float delta[4];
3482
3482
  uint16_t idx[4];
@@ -3535,9 +3535,9 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
3535
3535
 
3536
3536
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
3537
3537
 
3538
- void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
3538
+ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
3539
3539
  assert(k % QK4_NL == 0);
3540
- const int nb = k / QK4_NL;
3540
+ const int64_t nb = k / QK4_NL;
3541
3541
 
3542
3542
  for (int i = 0; i < nb; i++) {
3543
3543
 
@@ -3553,12 +3553,12 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
3553
3553
  }
3554
3554
  }
3555
3555
 
3556
- void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
3556
+ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
3557
3557
  assert(k % QK_K == 0);
3558
3558
  #if QK_K == 64
3559
3559
  dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
3560
3560
  #else
3561
- const int nb = k / QK_K;
3561
+ const int64_t nb = k / QK_K;
3562
3562
 
3563
3563
  for (int i = 0; i < nb; i++) {
3564
3564
 
@@ -3582,9 +3582,9 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
3582
3582
 
3583
3583
  //===================================== Q8_K ==============================================
3584
3584
 
3585
- void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
3585
+ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
3586
3586
  assert(k % QK_K == 0);
3587
- const int nb = k / QK_K;
3587
+ const int64_t nb = k / QK_K;
3588
3588
 
3589
3589
  for (int i = 0; i < nb; i++) {
3590
3590
 
@@ -3621,9 +3621,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
3621
3621
  }
3622
3622
  }
3623
3623
 
3624
- void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
3624
+ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
3625
3625
  assert(k % QK_K == 0);
3626
- const int nb = k / QK_K;
3626
+ const int64_t nb = k / QK_K;
3627
3627
 
3628
3628
  for (int i = 0; i < nb; i++) {
3629
3629
  for (int j = 0; j < QK_K; ++j) {
@@ -3632,7 +3632,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int
3632
3632
  }
3633
3633
  }
3634
3634
 
3635
- void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
3635
+ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
3636
3636
  quantize_row_q8_K_reference(x, y, k);
3637
3637
  }
3638
3638
 
@@ -10648,7 +10648,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
10648
10648
  return grid_index;
10649
10649
  }
10650
10650
 
10651
- static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
10651
+ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
10652
10652
 
10653
10653
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
10654
10654
 
@@ -10664,7 +10664,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
10664
10664
 
10665
10665
  const int kMaxQ = 3;
10666
10666
 
10667
- const int nbl = n/QK_K;
10667
+ const int64_t nbl = n/QK_K;
10668
10668
 
10669
10669
  block_iq2_xxs * y = vy;
10670
10670
 
@@ -10821,7 +10821,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
10821
10821
  }
10822
10822
  }
10823
10823
 
10824
- static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
10824
+ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
10825
10825
 
10826
10826
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
10827
10827
 
@@ -10837,7 +10837,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
10837
10837
 
10838
10838
  const int kMaxQ = 3;
10839
10839
 
10840
- const int nbl = n/QK_K;
10840
+ const int64_t nbl = n/QK_K;
10841
10841
 
10842
10842
  block_iq2_xs * y = vy;
10843
10843
 
@@ -11001,11 +11001,11 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
11001
11001
  }
11002
11002
  }
11003
11003
 
11004
- size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11004
+ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11005
11005
  GGML_ASSERT(n_per_row%QK_K == 0);
11006
- int nblock = n_per_row/QK_K;
11006
+ int64_t nblock = n_per_row/QK_K;
11007
11007
  char * qrow = (char *)dst;
11008
- for (int row = 0; row < nrow; ++row) {
11008
+ for (int64_t row = 0; row < nrow; ++row) {
11009
11009
  quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
11010
11010
  src += n_per_row;
11011
11011
  qrow += nblock*sizeof(block_iq2_xxs);
@@ -11013,11 +11013,11 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nro
11013
11013
  return nrow * nblock * sizeof(block_iq2_xxs);
11014
11014
  }
11015
11015
 
11016
- size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11016
+ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11017
11017
  GGML_ASSERT(n_per_row%QK_K == 0);
11018
- int nblock = n_per_row/QK_K;
11018
+ int64_t nblock = n_per_row/QK_K;
11019
11019
  char * qrow = (char *)dst;
11020
- for (int row = 0; row < nrow; ++row) {
11020
+ for (int64_t row = 0; row < nrow; ++row) {
11021
11021
  quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
11022
11022
  src += n_per_row;
11023
11023
  qrow += nblock*sizeof(block_iq2_xs);
@@ -11242,7 +11242,7 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
11242
11242
  return grid_index;
11243
11243
  }
11244
11244
 
11245
- static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
11245
+ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
11246
11246
  const float * restrict quant_weights) {
11247
11247
 
11248
11248
  const int gindex = iq3_data_index(grid_size);
@@ -11259,7 +11259,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11259
11259
 
11260
11260
  const int kMaxQ = 8;
11261
11261
 
11262
- const int nbl = n/QK_K;
11262
+ const int64_t nbl = n/QK_K;
11263
11263
 
11264
11264
  ggml_fp16_t * dh;
11265
11265
  uint8_t * qs;
@@ -11455,11 +11455,11 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11455
11455
  }
11456
11456
  }
11457
11457
 
11458
- size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11458
+ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11459
11459
  GGML_ASSERT(n_per_row%QK_K == 0);
11460
- int nblock = n_per_row/QK_K;
11460
+ int64_t nblock = n_per_row/QK_K;
11461
11461
  char * qrow = (char *)dst;
11462
- for (int row = 0; row < nrow; ++row) {
11462
+ for (int64_t row = 0; row < nrow; ++row) {
11463
11463
  quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
11464
11464
  src += n_per_row;
11465
11465
  qrow += nblock*sizeof(block_iq3_xxs);
@@ -11467,13 +11467,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nro
11467
11467
  return nrow * nblock * sizeof(block_iq3_xxs);
11468
11468
  }
11469
11469
 
11470
- void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
11470
+ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
11471
11471
  assert(k % QK_K == 0);
11472
11472
  block_iq3_xxs * restrict y = vy;
11473
11473
  quantize_row_iq3_xxs_reference(x, y, k);
11474
11474
  }
11475
11475
 
11476
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
11476
+ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
11477
11477
  assert(k % QK_K == 0);
11478
11478
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
11479
11479
  }
@@ -11504,7 +11504,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11504
11504
 
11505
11505
  const int kMaxQ = 8;
11506
11506
 
11507
- const int nbl = n/QK_K;
11507
+ const int64_t nbl = n/QK_K;
11508
11508
 
11509
11509
  block_iq3_s * y = vy;
11510
11510
 
@@ -11661,9 +11661,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11661
11661
  }
11662
11662
 
11663
11663
  #define IQ3S_BLOCK_SIZE 32
11664
- size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11664
+ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11665
11665
  GGML_ASSERT(n_per_row%QK_K == 0);
11666
- int nblock = n_per_row/QK_K;
11666
+ int64_t nblock = n_per_row/QK_K;
11667
11667
  float scales[QK_K/IQ3S_BLOCK_SIZE];
11668
11668
  float weight[IQ3S_BLOCK_SIZE];
11669
11669
  float xval[IQ3S_BLOCK_SIZE];
@@ -11674,7 +11674,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
11674
11674
  bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
11675
11675
  uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
11676
11676
  char * qrow = (char *)dst;
11677
- for (int row = 0; row < nrow; ++row) {
11677
+ for (int64_t row = 0; row < nrow; ++row) {
11678
11678
  quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
11679
11679
  scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
11680
11680
  src += n_per_row;
@@ -11683,13 +11683,13 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
11683
11683
  return nrow * nblock * sizeof(block_iq3_s);
11684
11684
  }
11685
11685
 
11686
- void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
11686
+ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
11687
11687
  assert(k % QK_K == 0);
11688
11688
  block_iq3_s * restrict y = vy;
11689
11689
  quantize_row_iq3_s_reference(x, y, k);
11690
11690
  }
11691
11691
 
11692
- void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
11692
+ void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
11693
11693
  assert(k % QK_K == 0);
11694
11694
  quantize_iq3_s(x, y, 1, k, NULL);
11695
11695
  }
@@ -11822,7 +11822,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
11822
11822
 
11823
11823
  #define IQ1S_BLOCK_SIZE 32
11824
11824
  #define IQ1M_BLOCK_SIZE 16
11825
- static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
11825
+ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
11826
11826
  float * scales,
11827
11827
  float * weight,
11828
11828
  float * sumx,
@@ -11846,7 +11846,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11846
11846
 
11847
11847
  block_iq1_s * y = vy;
11848
11848
 
11849
- const int nbl = n/QK_K;
11849
+ const int64_t nbl = n/QK_K;
11850
11850
 
11851
11851
  const int block_size = IQ1S_BLOCK_SIZE;
11852
11852
 
@@ -11980,7 +11980,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11980
11980
  }
11981
11981
  }
11982
11982
 
11983
- size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11983
+ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11984
11984
  GGML_ASSERT(n_per_row%QK_K == 0);
11985
11985
  float scales[QK_K/IQ1S_BLOCK_SIZE];
11986
11986
  float weight[IQ1S_BLOCK_SIZE];
@@ -11990,9 +11990,9 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
11990
11990
  float pairs[2*IQ1S_BLOCK_SIZE];
11991
11991
  uint16_t index[IQ1S_BLOCK_SIZE/8];
11992
11992
  int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
11993
- int nblock = n_per_row/QK_K;
11993
+ int64_t nblock = n_per_row/QK_K;
11994
11994
  char * qrow = (char *)dst;
11995
- for (int row = 0; row < nrow; ++row) {
11995
+ for (int64_t row = 0; row < nrow; ++row) {
11996
11996
  quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
11997
11997
  src += n_per_row;
11998
11998
  qrow += nblock*sizeof(block_iq1_s);
@@ -12000,7 +12000,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
12000
12000
  return nrow * nblock * sizeof(block_iq1_s);
12001
12001
  }
12002
12002
 
12003
- static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
12003
+ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
12004
12004
  float * scales,
12005
12005
  float * weight,
12006
12006
  float * pairs,
@@ -12022,7 +12022,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
12022
12022
 
12023
12023
  block_iq1_m * y = vy;
12024
12024
 
12025
- const int nbl = n/QK_K;
12025
+ const int64_t nbl = n/QK_K;
12026
12026
 
12027
12027
  const int block_size = IQ1M_BLOCK_SIZE;
12028
12028
 
@@ -12265,7 +12265,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
12265
12265
  }
12266
12266
  }
12267
12267
 
12268
- size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12268
+ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12269
12269
  GGML_ASSERT(n_per_row%QK_K == 0);
12270
12270
  float scales[QK_K/IQ1M_BLOCK_SIZE];
12271
12271
  float weight[IQ1M_BLOCK_SIZE];
@@ -12273,9 +12273,9 @@ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow,
12273
12273
  float pairs[2*IQ1M_BLOCK_SIZE];
12274
12274
  uint16_t index[IQ1M_BLOCK_SIZE/8];
12275
12275
  int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
12276
- int nblock = n_per_row/QK_K;
12276
+ int64_t nblock = n_per_row/QK_K;
12277
12277
  char * qrow = (char *)dst;
12278
- for (int row = 0; row < nrow; ++row) {
12278
+ for (int64_t row = 0; row < nrow; ++row) {
12279
12279
  quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
12280
12280
  src += n_per_row;
12281
12281
  qrow += nblock*sizeof(block_iq1_m);
@@ -12407,16 +12407,16 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
12407
12407
  }
12408
12408
  }
12409
12409
 
12410
- size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12410
+ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12411
12411
  GGML_ASSERT(n_per_row%QK4_NL == 0);
12412
- int nblock = n_per_row/QK4_NL;
12412
+ int64_t nblock = n_per_row/QK4_NL;
12413
12413
  char * qrow = (char *)dst;
12414
12414
  uint8_t L[QK4_NL];
12415
12415
  float weight[QK4_NL];
12416
12416
  uint16_t unused_h;
12417
12417
  uint8_t * unused_l = NULL;
12418
12418
  float scale;
12419
- for (int row = 0; row < nrow; ++row) {
12419
+ for (int64_t row = 0; row < nrow; ++row) {
12420
12420
  block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
12421
12421
  for (int ibl = 0; ibl < nblock; ++ibl) {
12422
12422
  const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
@@ -12429,9 +12429,9 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
12429
12429
  return nrow * nblock * sizeof(block_iq4_nl);
12430
12430
  }
12431
12431
 
12432
- void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12432
+ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) {
12433
12433
  GGML_ASSERT(k%QK4_NL == 0);
12434
- int nblock = k/QK4_NL;
12434
+ int64_t nblock = k/QK4_NL;
12435
12435
  uint8_t L[QK4_NL];
12436
12436
  float weight[QK4_NL];
12437
12437
  uint16_t unused_h;
@@ -12444,22 +12444,22 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12444
12444
  }
12445
12445
  }
12446
12446
 
12447
- void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
12447
+ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
12448
12448
  assert(k % QK4_NL == 0);
12449
12449
  quantize_row_iq4_nl(x, y, k);
12450
12450
  }
12451
12451
 
12452
- size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12452
+ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12453
12453
  #if QK_K == 64
12454
12454
  return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
12455
12455
  #else
12456
12456
  GGML_ASSERT(n_per_row%QK_K == 0);
12457
- int nblock = n_per_row/QK_K;
12457
+ int64_t nblock = n_per_row/QK_K;
12458
12458
  char * qrow = (char *)dst;
12459
12459
  uint8_t L[QK_K];
12460
12460
  float weight[32];
12461
12461
  float scales[QK_K/32];
12462
- for (int row = 0; row < nrow; ++row) {
12462
+ for (int64_t row = 0; row < nrow; ++row) {
12463
12463
  block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
12464
12464
  for (int ibl = 0; ibl < nblock; ++ibl) {
12465
12465
  const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
@@ -12473,20 +12473,20 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
12473
12473
  #endif
12474
12474
  }
12475
12475
 
12476
- void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
12476
+ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
12477
12477
  assert(k % QK_K == 0);
12478
12478
  block_iq4_xs * restrict y = vy;
12479
12479
  quantize_row_iq4_xs_reference(x, y, k);
12480
12480
  }
12481
12481
 
12482
- void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
12482
+ void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
12483
12483
  assert(k % QK_K == 0);
12484
12484
  quantize_iq4_xs(x, y, 1, k, NULL);
12485
12485
  }
12486
12486
 
12487
12487
  // =============================== 2.5625 bpw
12488
12488
 
12489
- static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12489
+ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
12490
12490
 
12491
12491
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
12492
12492
 
@@ -12501,7 +12501,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
12501
12501
 
12502
12502
  const int kMaxQ = 3;
12503
12503
 
12504
- const int nbl = n/QK_K;
12504
+ const int64_t nbl = n/QK_K;
12505
12505
 
12506
12506
  block_iq2_s * y = vy;
12507
12507
 
@@ -12654,11 +12654,11 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
12654
12654
  }
12655
12655
  }
12656
12656
 
12657
- size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12657
+ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12658
12658
  GGML_ASSERT(n_per_row%QK_K == 0);
12659
- int nblock = n_per_row/QK_K;
12659
+ int64_t nblock = n_per_row/QK_K;
12660
12660
  char * qrow = (char *)dst;
12661
- for (int row = 0; row < nrow; ++row) {
12661
+ for (int64_t row = 0; row < nrow; ++row) {
12662
12662
  quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
12663
12663
  src += n_per_row;
12664
12664
  qrow += nblock*sizeof(block_iq2_s);
@@ -12666,12 +12666,12 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow,
12666
12666
  return nrow * nblock * sizeof(block_iq2_s);
12667
12667
  }
12668
12668
 
12669
- void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
12669
+ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
12670
12670
  assert(k % QK_K == 0);
12671
12671
  quantize_iq2_s(x, y, 1, k, NULL);
12672
12672
  }
12673
12673
 
12674
- void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
12674
+ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
12675
12675
  assert(k % QK_K == 0);
12676
12676
  block_iq2_s * restrict y = vy;
12677
12677
  quantize_row_iq2_s_reference(x, y, k);