llama_cpp 0.14.4 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -544,7 +544,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
544
544
  #endif
545
545
 
546
546
  // reference implementation for deterministic creation of model files
547
- void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
547
+ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
548
548
  static const int qk = QK4_0;
549
549
 
550
550
  assert(k % qk == 0);
@@ -581,12 +581,12 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
581
581
  }
582
582
  }
583
583
 
584
- void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
584
+ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
585
585
  quantize_row_q4_0_reference(x, y, k);
586
586
  }
587
587
 
588
588
 
589
- void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
589
+ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
590
590
  const int qk = QK4_1;
591
591
 
592
592
  assert(k % qk == 0);
@@ -623,11 +623,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
623
623
  }
624
624
  }
625
625
 
626
- void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
626
+ void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
627
627
  quantize_row_q4_1_reference(x, y, k);
628
628
  }
629
629
 
630
- void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
630
+ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
631
631
  static const int qk = QK5_0;
632
632
 
633
633
  assert(k % qk == 0);
@@ -671,11 +671,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
671
671
  }
672
672
  }
673
673
 
674
- void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
674
+ void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
675
675
  quantize_row_q5_0_reference(x, y, k);
676
676
  }
677
677
 
678
- void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
678
+ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
679
679
  const int qk = QK5_1;
680
680
 
681
681
  assert(k % qk == 0);
@@ -719,12 +719,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
719
719
  }
720
720
  }
721
721
 
722
- void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
722
+ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
723
723
  quantize_row_q5_1_reference(x, y, k);
724
724
  }
725
725
 
726
726
  // reference implementation for deterministic creation of model files
727
- void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
727
+ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
728
728
  assert(k % QK8_0 == 0);
729
729
  const int nb = k / QK8_0;
730
730
 
@@ -749,7 +749,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict
749
749
  }
750
750
  }
751
751
 
752
- void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
752
+ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
753
753
  assert(QK8_0 == 32);
754
754
  assert(k % QK8_0 == 0);
755
755
  const int nb = k / QK8_0;
@@ -938,7 +938,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
938
938
  }
939
939
 
940
940
  // reference implementation for deterministic creation of model files
941
- void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
941
+ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
942
942
  assert(QK8_1 == 32);
943
943
  assert(k % QK8_1 == 0);
944
944
  const int nb = k / QK8_1;
@@ -973,7 +973,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
973
973
  }
974
974
  }
975
975
 
976
- void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
976
+ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
977
977
  assert(k % QK8_1 == 0);
978
978
  const int nb = k / QK8_1;
979
979
 
@@ -1192,7 +1192,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1192
1192
  #endif
1193
1193
  }
1194
1194
 
1195
- void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
1195
+ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
1196
1196
  static const int qk = QK4_0;
1197
1197
 
1198
1198
  assert(k % qk == 0);
@@ -1212,7 +1212,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
1212
1212
  }
1213
1213
  }
1214
1214
 
1215
- void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
1215
+ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
1216
1216
  static const int qk = QK4_1;
1217
1217
 
1218
1218
  assert(k % qk == 0);
@@ -1233,7 +1233,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
1233
1233
  }
1234
1234
  }
1235
1235
 
1236
- void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
1236
+ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
1237
1237
  static const int qk = QK5_0;
1238
1238
 
1239
1239
  assert(k % qk == 0);
@@ -1259,7 +1259,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int
1259
1259
  }
1260
1260
  }
1261
1261
 
1262
- void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
1262
+ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
1263
1263
  static const int qk = QK5_1;
1264
1264
 
1265
1265
  assert(k % qk == 0);
@@ -1286,7 +1286,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int
1286
1286
  }
1287
1287
  }
1288
1288
 
1289
- void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) {
1289
+ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
1290
1290
  static const int qk = QK8_0;
1291
1291
 
1292
1292
  assert(k % qk == 0);
@@ -1581,7 +1581,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
1581
1581
 
1582
1582
  //========================- 2-bit (de)-quantization
1583
1583
 
1584
- void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
1584
+ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
1585
1585
  assert(k % QK_K == 0);
1586
1586
  const int nb = k / QK_K;
1587
1587
 
@@ -1658,7 +1658,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
1658
1658
  }
1659
1659
  }
1660
1660
 
1661
- void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
1661
+ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
1662
1662
  assert(k % QK_K == 0);
1663
1663
  const int nb = k / QK_K;
1664
1664
 
@@ -1704,7 +1704,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
1704
1704
  }
1705
1705
  }
1706
1706
 
1707
- void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
1707
+ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
1708
1708
  quantize_row_q2_K_reference(x, vy, k);
1709
1709
  }
1710
1710
 
@@ -1960,14 +1960,14 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1960
1960
  }
1961
1961
  }
1962
1962
 
1963
- size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
1963
+ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1964
1964
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1965
1965
  if (!quant_weights) {
1966
- quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
1966
+ quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
1967
1967
  }
1968
1968
  else {
1969
1969
  char * qrow = (char *)dst;
1970
- for (int row = 0; row < nrow; ++row) {
1970
+ for (int64_t row = 0; row < nrow; ++row) {
1971
1971
  quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
1972
1972
  src += n_per_row;
1973
1973
  qrow += row_size;
@@ -1978,7 +1978,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow,
1978
1978
 
1979
1979
  //========================= 3-bit (de)-quantization
1980
1980
 
1981
- void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
1981
+ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
1982
1982
  assert(k % QK_K == 0);
1983
1983
  const int nb = k / QK_K;
1984
1984
 
@@ -2092,7 +2092,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
2092
2092
  }
2093
2093
 
2094
2094
  #if QK_K == 256
2095
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
2095
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
2096
2096
  assert(k % QK_K == 0);
2097
2097
  const int nb = k / QK_K;
2098
2098
 
@@ -2142,7 +2142,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
2142
2142
  }
2143
2143
  }
2144
2144
  #else
2145
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
2145
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
2146
2146
  assert(k % QK_K == 0);
2147
2147
  assert(QK_K == 64);
2148
2148
  const int nb = k / QK_K;
@@ -2175,11 +2175,11 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
2175
2175
  }
2176
2176
  #endif
2177
2177
 
2178
- void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
2178
+ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
2179
2179
  quantize_row_q3_K_reference(x, vy, k);
2180
2180
  }
2181
2181
 
2182
- static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2182
+ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
2183
2183
  #if QK_K != 256
2184
2184
  (void)quant_weights;
2185
2185
  quantize_row_q3_K_reference(x, y, n_per_row);
@@ -2268,14 +2268,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
2268
2268
  #endif
2269
2269
  }
2270
2270
 
2271
- size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2271
+ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2272
2272
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2273
2273
  if (!quant_weights) {
2274
- quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
2274
+ quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
2275
2275
  }
2276
2276
  else {
2277
2277
  char * qrow = (char *)dst;
2278
- for (int row = 0; row < nrow; ++row) {
2278
+ for (int64_t row = 0; row < nrow; ++row) {
2279
2279
  quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
2280
2280
  src += n_per_row;
2281
2281
  qrow += row_size;
@@ -2286,7 +2286,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow,
2286
2286
 
2287
2287
  // ====================== 4-bit (de)-quantization
2288
2288
 
2289
- void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
2289
+ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
2290
2290
  assert(k % QK_K == 0);
2291
2291
  const int nb = k / QK_K;
2292
2292
 
@@ -2393,7 +2393,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
2393
2393
  }
2394
2394
  }
2395
2395
 
2396
- void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
2396
+ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
2397
2397
  assert(k % QK_K == 0);
2398
2398
  const int nb = k / QK_K;
2399
2399
 
@@ -2432,19 +2432,19 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
2432
2432
  }
2433
2433
  }
2434
2434
 
2435
- void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
2435
+ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
2436
2436
  assert(k % QK_K == 0);
2437
2437
  block_q4_K * restrict y = vy;
2438
2438
  quantize_row_q4_K_reference(x, y, k);
2439
2439
  }
2440
2440
 
2441
- static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2441
+ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
2442
2442
  #if QK_K != 256
2443
2443
  (void)quant_weights;
2444
2444
  quantize_row_q4_K_reference(x, y, n_per_row);
2445
2445
  #else
2446
2446
  assert(n_per_row % QK_K == 0);
2447
- const int nb = n_per_row / QK_K;
2447
+ const int64_t nb = n_per_row / QK_K;
2448
2448
 
2449
2449
  uint8_t L[QK_K];
2450
2450
  uint8_t Laux[32];
@@ -2516,14 +2516,14 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2516
2516
  #endif
2517
2517
  }
2518
2518
 
2519
- size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2519
+ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2520
2520
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2521
2521
  if (!quant_weights) {
2522
- quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
2522
+ quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
2523
2523
  }
2524
2524
  else {
2525
2525
  char * qrow = (char *)dst;
2526
- for (int row = 0; row < nrow; ++row) {
2526
+ for (int64_t row = 0; row < nrow; ++row) {
2527
2527
  quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
2528
2528
  src += n_per_row;
2529
2529
  qrow += row_size;
@@ -2534,9 +2534,9 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow,
2534
2534
 
2535
2535
  // ====================== 5-bit (de)-quantization
2536
2536
 
2537
- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
2537
+ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
2538
2538
  assert(k % QK_K == 0);
2539
- const int nb = k / QK_K;
2539
+ const int64_t nb = k / QK_K;
2540
2540
 
2541
2541
  #if QK_K == 256
2542
2542
  uint8_t L[QK_K];
@@ -2676,9 +2676,9 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
2676
2676
  }
2677
2677
  }
2678
2678
 
2679
- void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
2679
+ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
2680
2680
  assert(k % QK_K == 0);
2681
- const int nb = k / QK_K;
2681
+ const int64_t nb = k / QK_K;
2682
2682
 
2683
2683
  for (int i = 0; i < nb; i++) {
2684
2684
 
@@ -2721,19 +2721,19 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
2721
2721
  }
2722
2722
  }
2723
2723
 
2724
- void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
2724
+ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
2725
2725
  assert(k % QK_K == 0);
2726
2726
  block_q5_K * restrict y = vy;
2727
2727
  quantize_row_q5_K_reference(x, y, k);
2728
2728
  }
2729
2729
 
2730
- static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2730
+ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
2731
2731
  #if QK_K != 256
2732
2732
  (void)quant_weights;
2733
2733
  quantize_row_q5_K_reference(x, y, n_per_row);
2734
2734
  #else
2735
2735
  assert(n_per_row % QK_K == 0);
2736
- const int nb = n_per_row / QK_K;
2736
+ const int64_t nb = n_per_row / QK_K;
2737
2737
 
2738
2738
  uint8_t L[QK_K];
2739
2739
  uint8_t Laux[32];
@@ -2825,14 +2825,14 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2825
2825
  #endif
2826
2826
  }
2827
2827
 
2828
- size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2828
+ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2829
2829
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2830
2830
  if (!quant_weights) {
2831
- quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
2831
+ quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
2832
2832
  }
2833
2833
  else {
2834
2834
  char * qrow = (char *)dst;
2835
- for (int row = 0; row < nrow; ++row) {
2835
+ for (int64_t row = 0; row < nrow; ++row) {
2836
2836
  quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
2837
2837
  src += n_per_row;
2838
2838
  qrow += row_size;
@@ -2843,9 +2843,9 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow,
2843
2843
 
2844
2844
  // ====================== 6-bit (de)-quantization
2845
2845
 
2846
- void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
2846
+ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
2847
2847
  assert(k % QK_K == 0);
2848
- const int nb = k / QK_K;
2848
+ const int64_t nb = k / QK_K;
2849
2849
 
2850
2850
  int8_t L[QK_K];
2851
2851
  float scales[QK_K/16];
@@ -2925,9 +2925,9 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2925
2925
  }
2926
2926
  }
2927
2927
 
2928
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
2928
+ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
2929
2929
  assert(k % QK_K == 0);
2930
- const int nb = k / QK_K;
2930
+ const int64_t nb = k / QK_K;
2931
2931
 
2932
2932
  for (int i = 0; i < nb; i++) {
2933
2933
 
@@ -2972,19 +2972,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
2972
2972
  }
2973
2973
  }
2974
2974
 
2975
- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
2975
+ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
2976
2976
  assert(k % QK_K == 0);
2977
2977
  block_q6_K * restrict y = vy;
2978
2978
  quantize_row_q6_K_reference(x, y, k);
2979
2979
  }
2980
2980
 
2981
- static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
2981
+ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
2982
2982
  #if QK_K != 256
2983
2983
  (void)quant_weights;
2984
2984
  quantize_row_q6_K_reference(x, y, n_per_row);
2985
2985
  #else
2986
2986
  assert(n_per_row % QK_K == 0);
2987
- const int nb = n_per_row / QK_K;
2987
+ const int64_t nb = n_per_row / QK_K;
2988
2988
 
2989
2989
  int8_t L[QK_K];
2990
2990
  float scales[QK_K/16];
@@ -3067,14 +3067,14 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
3067
3067
  #endif
3068
3068
  }
3069
3069
 
3070
- size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3070
+ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3071
3071
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3072
3072
  if (!quant_weights) {
3073
- quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
3073
+ quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
3074
3074
  }
3075
3075
  else {
3076
3076
  char * qrow = (char *)dst;
3077
- for (int row = 0; row < nrow; ++row) {
3077
+ for (int64_t row = 0; row < nrow; ++row) {
3078
3078
  quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
3079
3079
  src += n_per_row;
3080
3080
  qrow += row_size;
@@ -3083,7 +3083,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow,
3083
3083
  return nrow * row_size;
3084
3084
  }
3085
3085
 
3086
- static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
3086
+ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
3087
3087
  static_assert(QK4_0 == 32, "QK4_0 must be 32");
3088
3088
 
3089
3089
  if (!quant_weights) {
@@ -3098,7 +3098,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3098
3098
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3099
3099
  float sigma2 = sum_x2/n_per_row;
3100
3100
 
3101
- const int nb = n_per_row/QK4_0;
3101
+ const int64_t nb = n_per_row/QK4_0;
3102
3102
  for (int ib = 0; ib < nb; ++ib) {
3103
3103
  const float * xb = x + QK4_0 * ib;
3104
3104
  const float * qw = quant_weights + QK4_0 * ib;
@@ -3111,14 +3111,14 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3111
3111
  }
3112
3112
  }
3113
3113
 
3114
- size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3114
+ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3115
3115
  if (!quant_weights) {
3116
- quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
3116
+ quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
3117
3117
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3118
3118
  }
3119
3119
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3120
3120
  char * qrow = (char *)dst;
3121
- for (int row = 0; row < nrow; ++row) {
3121
+ for (int64_t row = 0; row < nrow; ++row) {
3122
3122
  quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
3123
3123
  src += n_per_row;
3124
3124
  qrow += row_size;
@@ -3126,7 +3126,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow,
3126
3126
  return nrow * row_size;
3127
3127
  }
3128
3128
 
3129
- static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
3129
+ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
3130
3130
  static_assert(QK4_1 == 32, "QK4_1 must be 32");
3131
3131
 
3132
3132
  if (!quant_weights) {
@@ -3141,7 +3141,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3141
3141
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3142
3142
  float sigma2 = sum_x2/n_per_row;
3143
3143
 
3144
- const int nb = n_per_row/QK4_1;
3144
+ const int64_t nb = n_per_row/QK4_1;
3145
3145
  for (int ib = 0; ib < nb; ++ib) {
3146
3146
  const float * xb = x + QK4_1 * ib;
3147
3147
  const float * qw = quant_weights + QK4_1 * ib;
@@ -3156,14 +3156,14 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3156
3156
  }
3157
3157
  }
3158
3158
 
3159
- size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3159
+ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3160
3160
  if (!quant_weights) {
3161
- quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
3161
+ quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
3162
3162
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3163
3163
  }
3164
3164
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3165
3165
  char * qrow = (char *)dst;
3166
- for (int row = 0; row < nrow; ++row) {
3166
+ for (int64_t row = 0; row < nrow; ++row) {
3167
3167
  quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
3168
3168
  src += n_per_row;
3169
3169
  qrow += row_size;
@@ -3171,7 +3171,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow,
3171
3171
  return nrow * row_size;
3172
3172
  }
3173
3173
 
3174
- static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
3174
+ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
3175
3175
  static_assert(QK5_0 == 32, "QK5_0 must be 32");
3176
3176
 
3177
3177
  if (!quant_weights) {
@@ -3186,7 +3186,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3186
3186
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3187
3187
  float sigma2 = sum_x2/n_per_row;
3188
3188
 
3189
- const int nb = n_per_row/QK5_0;
3189
+ const int64_t nb = n_per_row/QK5_0;
3190
3190
  for (int ib = 0; ib < nb; ++ib) {
3191
3191
  const float * xb = x + QK5_0 * ib;
3192
3192
  const float * qw = quant_weights + QK5_0 * ib;
@@ -3210,14 +3210,14 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3210
3210
  }
3211
3211
  }
3212
3212
 
3213
- size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3213
+ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3214
3214
  if (!quant_weights) {
3215
- quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
3215
+ quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
3216
3216
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3217
3217
  }
3218
3218
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3219
3219
  char * qrow = (char *)dst;
3220
- for (int row = 0; row < nrow; ++row) {
3220
+ for (int64_t row = 0; row < nrow; ++row) {
3221
3221
  quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
3222
3222
  src += n_per_row;
3223
3223
  qrow += row_size;
@@ -3225,7 +3225,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow,
3225
3225
  return nrow * row_size;
3226
3226
  }
3227
3227
 
3228
- static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
3228
+ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
3229
3229
  static_assert(QK5_1 == 32, "QK5_1 must be 32");
3230
3230
 
3231
3231
  if (!quant_weights) {
@@ -3240,7 +3240,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3240
3240
  for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3241
3241
  float sigma2 = sum_x2/n_per_row;
3242
3242
 
3243
- const int nb = n_per_row/QK5_1;
3243
+ const int64_t nb = n_per_row/QK5_1;
3244
3244
  for (int ib = 0; ib < nb; ++ib) {
3245
3245
  const float * xb = x + QK5_1 * ib;
3246
3246
  const float * qw = quant_weights + QK5_1 * ib;
@@ -3263,14 +3263,14 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3263
3263
  }
3264
3264
  }
3265
3265
 
3266
- size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3266
+ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3267
3267
  if (!quant_weights) {
3268
- quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
3268
+ quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
3269
3269
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3270
3270
  }
3271
3271
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3272
3272
  char * qrow = (char *)dst;
3273
- for (int row = 0; row < nrow; ++row) {
3273
+ for (int64_t row = 0; row < nrow; ++row) {
3274
3274
  quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
3275
3275
  src += n_per_row;
3276
3276
  qrow += row_size;
@@ -3278,18 +3278,18 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow,
3278
3278
  return nrow * row_size;
3279
3279
  }
3280
3280
 
3281
- size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3281
+ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3282
3282
  (void)quant_weights; // not used
3283
3283
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
3284
- quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
3284
+ quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
3285
3285
  return nrow * row_size;
3286
3286
  }
3287
3287
 
3288
3288
  // ====================== "True" 2-bit (de)-quantization
3289
3289
 
3290
- void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
3290
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
3291
3291
  assert(k % QK_K == 0);
3292
- const int nb = k / QK_K;
3292
+ const int64_t nb = k / QK_K;
3293
3293
 
3294
3294
  uint32_t aux32[2];
3295
3295
  const uint8_t * aux8 = (const uint8_t *)aux32;
@@ -3315,9 +3315,9 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
3315
3315
 
3316
3316
  // ====================== 2.3125 bpw (de)-quantization
3317
3317
 
3318
- void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
3318
+ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
3319
3319
  assert(k % QK_K == 0);
3320
- const int nb = k / QK_K;
3320
+ const int64_t nb = k / QK_K;
3321
3321
 
3322
3322
  float db[2];
3323
3323
 
@@ -3342,9 +3342,9 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
3342
3342
 
3343
3343
  // ====================== 2.5625 bpw (de)-quantization
3344
3344
 
3345
- void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
3345
+ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
3346
3346
  assert(k % QK_K == 0);
3347
- const int nb = k / QK_K;
3347
+ const int64_t nb = k / QK_K;
3348
3348
 
3349
3349
  float db[2];
3350
3350
 
@@ -3374,9 +3374,9 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
3374
3374
 
3375
3375
  // ====================== 3.0625 bpw (de)-quantization
3376
3376
 
3377
- void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
3377
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
3378
3378
  assert(k % QK_K == 0);
3379
- const int nb = k / QK_K;
3379
+ const int64_t nb = k / QK_K;
3380
3380
 
3381
3381
  uint32_t aux32;
3382
3382
 
@@ -3406,9 +3406,9 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
3406
3406
 
3407
3407
  // ====================== 3.3125 bpw (de)-quantization
3408
3408
 
3409
- void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
3409
+ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
3410
3410
  assert(k % QK_K == 0);
3411
- const int nb = k / QK_K;
3411
+ const int64_t nb = k / QK_K;
3412
3412
 
3413
3413
  for (int i = 0; i < nb; i++) {
3414
3414
 
@@ -3449,9 +3449,9 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
3449
3449
 
3450
3450
  // ====================== 1.5625 bpw (de)-quantization
3451
3451
 
3452
- void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
3452
+ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
3453
3453
  assert(k % QK_K == 0);
3454
- const int nb = k / QK_K;
3454
+ const int64_t nb = k / QK_K;
3455
3455
 
3456
3456
  for (int i = 0; i < nb; i++) {
3457
3457
 
@@ -3474,9 +3474,9 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
3474
3474
  }
3475
3475
  }
3476
3476
 
3477
- void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
3477
+ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
3478
3478
  assert(k % QK_K == 0);
3479
- const int nb = k / QK_K;
3479
+ const int64_t nb = k / QK_K;
3480
3480
 
3481
3481
  float delta[4];
3482
3482
  uint16_t idx[4];
@@ -3535,9 +3535,9 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
3535
3535
 
3536
3536
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
3537
3537
 
3538
- void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
3538
+ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
3539
3539
  assert(k % QK4_NL == 0);
3540
- const int nb = k / QK4_NL;
3540
+ const int64_t nb = k / QK4_NL;
3541
3541
 
3542
3542
  for (int i = 0; i < nb; i++) {
3543
3543
 
@@ -3553,12 +3553,12 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
3553
3553
  }
3554
3554
  }
3555
3555
 
3556
- void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
3556
+ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
3557
3557
  assert(k % QK_K == 0);
3558
3558
  #if QK_K == 64
3559
3559
  dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
3560
3560
  #else
3561
- const int nb = k / QK_K;
3561
+ const int64_t nb = k / QK_K;
3562
3562
 
3563
3563
  for (int i = 0; i < nb; i++) {
3564
3564
 
@@ -3582,9 +3582,9 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
3582
3582
 
3583
3583
  //===================================== Q8_K ==============================================
3584
3584
 
3585
- void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
3585
+ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
3586
3586
  assert(k % QK_K == 0);
3587
- const int nb = k / QK_K;
3587
+ const int64_t nb = k / QK_K;
3588
3588
 
3589
3589
  for (int i = 0; i < nb; i++) {
3590
3590
 
@@ -3621,9 +3621,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
3621
3621
  }
3622
3622
  }
3623
3623
 
3624
- void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
3624
+ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
3625
3625
  assert(k % QK_K == 0);
3626
- const int nb = k / QK_K;
3626
+ const int64_t nb = k / QK_K;
3627
3627
 
3628
3628
  for (int i = 0; i < nb; i++) {
3629
3629
  for (int j = 0; j < QK_K; ++j) {
@@ -3632,7 +3632,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int
3632
3632
  }
3633
3633
  }
3634
3634
 
3635
- void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
3635
+ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
3636
3636
  quantize_row_q8_K_reference(x, y, k);
3637
3637
  }
3638
3638
 
@@ -10648,7 +10648,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
10648
10648
  return grid_index;
10649
10649
  }
10650
10650
 
10651
- static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
10651
+ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
10652
10652
 
10653
10653
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
10654
10654
 
@@ -10664,7 +10664,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
10664
10664
 
10665
10665
  const int kMaxQ = 3;
10666
10666
 
10667
- const int nbl = n/QK_K;
10667
+ const int64_t nbl = n/QK_K;
10668
10668
 
10669
10669
  block_iq2_xxs * y = vy;
10670
10670
 
@@ -10821,7 +10821,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
10821
10821
  }
10822
10822
  }
10823
10823
 
10824
- static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
10824
+ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
10825
10825
 
10826
10826
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
10827
10827
 
@@ -10837,7 +10837,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
10837
10837
 
10838
10838
  const int kMaxQ = 3;
10839
10839
 
10840
- const int nbl = n/QK_K;
10840
+ const int64_t nbl = n/QK_K;
10841
10841
 
10842
10842
  block_iq2_xs * y = vy;
10843
10843
 
@@ -11001,11 +11001,11 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
11001
11001
  }
11002
11002
  }
11003
11003
 
11004
- size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11004
+ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11005
11005
  GGML_ASSERT(n_per_row%QK_K == 0);
11006
- int nblock = n_per_row/QK_K;
11006
+ int64_t nblock = n_per_row/QK_K;
11007
11007
  char * qrow = (char *)dst;
11008
- for (int row = 0; row < nrow; ++row) {
11008
+ for (int64_t row = 0; row < nrow; ++row) {
11009
11009
  quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
11010
11010
  src += n_per_row;
11011
11011
  qrow += nblock*sizeof(block_iq2_xxs);
@@ -11013,11 +11013,11 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nro
11013
11013
  return nrow * nblock * sizeof(block_iq2_xxs);
11014
11014
  }
11015
11015
 
11016
- size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11016
+ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11017
11017
  GGML_ASSERT(n_per_row%QK_K == 0);
11018
- int nblock = n_per_row/QK_K;
11018
+ int64_t nblock = n_per_row/QK_K;
11019
11019
  char * qrow = (char *)dst;
11020
- for (int row = 0; row < nrow; ++row) {
11020
+ for (int64_t row = 0; row < nrow; ++row) {
11021
11021
  quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
11022
11022
  src += n_per_row;
11023
11023
  qrow += nblock*sizeof(block_iq2_xs);
@@ -11242,7 +11242,7 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
11242
11242
  return grid_index;
11243
11243
  }
11244
11244
 
11245
- static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
11245
+ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
11246
11246
  const float * restrict quant_weights) {
11247
11247
 
11248
11248
  const int gindex = iq3_data_index(grid_size);
@@ -11259,7 +11259,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11259
11259
 
11260
11260
  const int kMaxQ = 8;
11261
11261
 
11262
- const int nbl = n/QK_K;
11262
+ const int64_t nbl = n/QK_K;
11263
11263
 
11264
11264
  ggml_fp16_t * dh;
11265
11265
  uint8_t * qs;
@@ -11455,11 +11455,11 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11455
11455
  }
11456
11456
  }
11457
11457
 
11458
- size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11458
+ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11459
11459
  GGML_ASSERT(n_per_row%QK_K == 0);
11460
- int nblock = n_per_row/QK_K;
11460
+ int64_t nblock = n_per_row/QK_K;
11461
11461
  char * qrow = (char *)dst;
11462
- for (int row = 0; row < nrow; ++row) {
11462
+ for (int64_t row = 0; row < nrow; ++row) {
11463
11463
  quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
11464
11464
  src += n_per_row;
11465
11465
  qrow += nblock*sizeof(block_iq3_xxs);
@@ -11467,13 +11467,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nro
11467
11467
  return nrow * nblock * sizeof(block_iq3_xxs);
11468
11468
  }
11469
11469
 
11470
- void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
11470
+ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
11471
11471
  assert(k % QK_K == 0);
11472
11472
  block_iq3_xxs * restrict y = vy;
11473
11473
  quantize_row_iq3_xxs_reference(x, y, k);
11474
11474
  }
11475
11475
 
11476
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
11476
+ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
11477
11477
  assert(k % QK_K == 0);
11478
11478
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
11479
11479
  }
@@ -11504,7 +11504,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11504
11504
 
11505
11505
  const int kMaxQ = 8;
11506
11506
 
11507
- const int nbl = n/QK_K;
11507
+ const int64_t nbl = n/QK_K;
11508
11508
 
11509
11509
  block_iq3_s * y = vy;
11510
11510
 
@@ -11661,9 +11661,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11661
11661
  }
11662
11662
 
11663
11663
  #define IQ3S_BLOCK_SIZE 32
11664
- size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11664
+ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11665
11665
  GGML_ASSERT(n_per_row%QK_K == 0);
11666
- int nblock = n_per_row/QK_K;
11666
+ int64_t nblock = n_per_row/QK_K;
11667
11667
  float scales[QK_K/IQ3S_BLOCK_SIZE];
11668
11668
  float weight[IQ3S_BLOCK_SIZE];
11669
11669
  float xval[IQ3S_BLOCK_SIZE];
@@ -11674,7 +11674,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
11674
11674
  bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
11675
11675
  uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
11676
11676
  char * qrow = (char *)dst;
11677
- for (int row = 0; row < nrow; ++row) {
11677
+ for (int64_t row = 0; row < nrow; ++row) {
11678
11678
  quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
11679
11679
  scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
11680
11680
  src += n_per_row;
@@ -11683,13 +11683,13 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
11683
11683
  return nrow * nblock * sizeof(block_iq3_s);
11684
11684
  }
11685
11685
 
11686
- void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
11686
+ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
11687
11687
  assert(k % QK_K == 0);
11688
11688
  block_iq3_s * restrict y = vy;
11689
11689
  quantize_row_iq3_s_reference(x, y, k);
11690
11690
  }
11691
11691
 
11692
- void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
11692
+ void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
11693
11693
  assert(k % QK_K == 0);
11694
11694
  quantize_iq3_s(x, y, 1, k, NULL);
11695
11695
  }
@@ -11822,7 +11822,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
11822
11822
 
11823
11823
  #define IQ1S_BLOCK_SIZE 32
11824
11824
  #define IQ1M_BLOCK_SIZE 16
11825
- static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
11825
+ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
11826
11826
  float * scales,
11827
11827
  float * weight,
11828
11828
  float * sumx,
@@ -11846,7 +11846,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11846
11846
 
11847
11847
  block_iq1_s * y = vy;
11848
11848
 
11849
- const int nbl = n/QK_K;
11849
+ const int64_t nbl = n/QK_K;
11850
11850
 
11851
11851
  const int block_size = IQ1S_BLOCK_SIZE;
11852
11852
 
@@ -11980,7 +11980,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11980
11980
  }
11981
11981
  }
11982
11982
 
11983
- size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11983
+ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
11984
11984
  GGML_ASSERT(n_per_row%QK_K == 0);
11985
11985
  float scales[QK_K/IQ1S_BLOCK_SIZE];
11986
11986
  float weight[IQ1S_BLOCK_SIZE];
@@ -11990,9 +11990,9 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
11990
11990
  float pairs[2*IQ1S_BLOCK_SIZE];
11991
11991
  uint16_t index[IQ1S_BLOCK_SIZE/8];
11992
11992
  int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
11993
- int nblock = n_per_row/QK_K;
11993
+ int64_t nblock = n_per_row/QK_K;
11994
11994
  char * qrow = (char *)dst;
11995
- for (int row = 0; row < nrow; ++row) {
11995
+ for (int64_t row = 0; row < nrow; ++row) {
11996
11996
  quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
11997
11997
  src += n_per_row;
11998
11998
  qrow += nblock*sizeof(block_iq1_s);
@@ -12000,7 +12000,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
12000
12000
  return nrow * nblock * sizeof(block_iq1_s);
12001
12001
  }
12002
12002
 
12003
- static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
12003
+ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
12004
12004
  float * scales,
12005
12005
  float * weight,
12006
12006
  float * pairs,
@@ -12022,7 +12022,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
12022
12022
 
12023
12023
  block_iq1_m * y = vy;
12024
12024
 
12025
- const int nbl = n/QK_K;
12025
+ const int64_t nbl = n/QK_K;
12026
12026
 
12027
12027
  const int block_size = IQ1M_BLOCK_SIZE;
12028
12028
 
@@ -12265,7 +12265,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
12265
12265
  }
12266
12266
  }
12267
12267
 
12268
- size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12268
+ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12269
12269
  GGML_ASSERT(n_per_row%QK_K == 0);
12270
12270
  float scales[QK_K/IQ1M_BLOCK_SIZE];
12271
12271
  float weight[IQ1M_BLOCK_SIZE];
@@ -12273,9 +12273,9 @@ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow,
12273
12273
  float pairs[2*IQ1M_BLOCK_SIZE];
12274
12274
  uint16_t index[IQ1M_BLOCK_SIZE/8];
12275
12275
  int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
12276
- int nblock = n_per_row/QK_K;
12276
+ int64_t nblock = n_per_row/QK_K;
12277
12277
  char * qrow = (char *)dst;
12278
- for (int row = 0; row < nrow; ++row) {
12278
+ for (int64_t row = 0; row < nrow; ++row) {
12279
12279
  quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
12280
12280
  src += n_per_row;
12281
12281
  qrow += nblock*sizeof(block_iq1_m);
@@ -12407,16 +12407,16 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
12407
12407
  }
12408
12408
  }
12409
12409
 
12410
- size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12410
+ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12411
12411
  GGML_ASSERT(n_per_row%QK4_NL == 0);
12412
- int nblock = n_per_row/QK4_NL;
12412
+ int64_t nblock = n_per_row/QK4_NL;
12413
12413
  char * qrow = (char *)dst;
12414
12414
  uint8_t L[QK4_NL];
12415
12415
  float weight[QK4_NL];
12416
12416
  uint16_t unused_h;
12417
12417
  uint8_t * unused_l = NULL;
12418
12418
  float scale;
12419
- for (int row = 0; row < nrow; ++row) {
12419
+ for (int64_t row = 0; row < nrow; ++row) {
12420
12420
  block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
12421
12421
  for (int ibl = 0; ibl < nblock; ++ibl) {
12422
12422
  const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
@@ -12429,9 +12429,9 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
12429
12429
  return nrow * nblock * sizeof(block_iq4_nl);
12430
12430
  }
12431
12431
 
12432
- void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12432
+ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) {
12433
12433
  GGML_ASSERT(k%QK4_NL == 0);
12434
- int nblock = k/QK4_NL;
12434
+ int64_t nblock = k/QK4_NL;
12435
12435
  uint8_t L[QK4_NL];
12436
12436
  float weight[QK4_NL];
12437
12437
  uint16_t unused_h;
@@ -12444,22 +12444,22 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12444
12444
  }
12445
12445
  }
12446
12446
 
12447
- void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
12447
+ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
12448
12448
  assert(k % QK4_NL == 0);
12449
12449
  quantize_row_iq4_nl(x, y, k);
12450
12450
  }
12451
12451
 
12452
- size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12452
+ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12453
12453
  #if QK_K == 64
12454
12454
  return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
12455
12455
  #else
12456
12456
  GGML_ASSERT(n_per_row%QK_K == 0);
12457
- int nblock = n_per_row/QK_K;
12457
+ int64_t nblock = n_per_row/QK_K;
12458
12458
  char * qrow = (char *)dst;
12459
12459
  uint8_t L[QK_K];
12460
12460
  float weight[32];
12461
12461
  float scales[QK_K/32];
12462
- for (int row = 0; row < nrow; ++row) {
12462
+ for (int64_t row = 0; row < nrow; ++row) {
12463
12463
  block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
12464
12464
  for (int ibl = 0; ibl < nblock; ++ibl) {
12465
12465
  const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
@@ -12473,20 +12473,20 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
12473
12473
  #endif
12474
12474
  }
12475
12475
 
12476
- void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
12476
+ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
12477
12477
  assert(k % QK_K == 0);
12478
12478
  block_iq4_xs * restrict y = vy;
12479
12479
  quantize_row_iq4_xs_reference(x, y, k);
12480
12480
  }
12481
12481
 
12482
- void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
12482
+ void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
12483
12483
  assert(k % QK_K == 0);
12484
12484
  quantize_iq4_xs(x, y, 1, k, NULL);
12485
12485
  }
12486
12486
 
12487
12487
  // =============================== 2.5625 bpw
12488
12488
 
12489
- static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12489
+ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
12490
12490
 
12491
12491
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
12492
12492
 
@@ -12501,7 +12501,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
12501
12501
 
12502
12502
  const int kMaxQ = 3;
12503
12503
 
12504
- const int nbl = n/QK_K;
12504
+ const int64_t nbl = n/QK_K;
12505
12505
 
12506
12506
  block_iq2_s * y = vy;
12507
12507
 
@@ -12654,11 +12654,11 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
12654
12654
  }
12655
12655
  }
12656
12656
 
12657
- size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12657
+ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
12658
12658
  GGML_ASSERT(n_per_row%QK_K == 0);
12659
- int nblock = n_per_row/QK_K;
12659
+ int64_t nblock = n_per_row/QK_K;
12660
12660
  char * qrow = (char *)dst;
12661
- for (int row = 0; row < nrow; ++row) {
12661
+ for (int64_t row = 0; row < nrow; ++row) {
12662
12662
  quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
12663
12663
  src += n_per_row;
12664
12664
  qrow += nblock*sizeof(block_iq2_s);
@@ -12666,12 +12666,12 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow,
12666
12666
  return nrow * nblock * sizeof(block_iq2_s);
12667
12667
  }
12668
12668
 
12669
- void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
12669
+ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
12670
12670
  assert(k % QK_K == 0);
12671
12671
  quantize_iq2_s(x, y, 1, k, NULL);
12672
12672
  }
12673
12673
 
12674
- void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
12674
+ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
12675
12675
  assert(k % QK_K == 0);
12676
12676
  block_iq2_s * restrict y = vy;
12677
12677
  quantize_row_iq2_s_reference(x, y, k);