RubyGems - llama_cpp - Versions diffs - 0.14.4 → 0.14.6 - Mend

llama_cpp 0.14.4 → 0.14.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +13 -1
data/examples/chat.rb +2 -4
data/ext/llama_cpp/extconf.rb +1 -0
data/ext/llama_cpp/llama_cpp.cpp +23 -0
data/lib/llama_cpp/version.rb +2 -2
data/sig/llama_cpp.rbs +10 -0
data/vendor/tmp/llama.cpp/LICENSE +1 -1
data/vendor/tmp/llama.cpp/Makefile +29 -9
data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
data/vendor/tmp/llama.cpp/ggml-cuda.cu +142 -49
data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
data/vendor/tmp/llama.cpp/ggml-quants.c +156 -156
data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
data/vendor/tmp/llama.cpp/ggml-sycl.cpp +942 -267
data/vendor/tmp/llama.cpp/ggml.c +161 -95
data/vendor/tmp/llama.cpp/ggml.h +12 -11
data/vendor/tmp/llama.cpp/llama.cpp +1577 -274
data/vendor/tmp/llama.cpp/llama.h +81 -13
data/vendor/tmp/llama.cpp/sgemm.cpp +1148 -0
data/vendor/tmp/llama.cpp/sgemm.h +12 -0
metadata +4 -2

data/vendor/tmp/llama.cpp/ggml-quants.c CHANGED Viewed

@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
+#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
@@ -544,7 +544,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
 #endif
 // reference implementation for deterministic creation of model files
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
+void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
     static const int qk = QK4_0;
     assert(k % qk == 0);
@@ -581,12 +581,12 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
     }
 }
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
+void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
     quantize_row_q4_0_reference(x, y, k);
 }
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
+void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
     const int qk = QK4_1;
     assert(k % qk == 0);
@@ -623,11 +623,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
     }
 }
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
+void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
     quantize_row_q4_1_reference(x, y, k);
 }
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
+void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
     static const int qk = QK5_0;
     assert(k % qk == 0);
@@ -671,11 +671,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
     }
 }
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
+void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
     quantize_row_q5_0_reference(x, y, k);
 }
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
+void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
     const int qk = QK5_1;
     assert(k % qk == 0);
@@ -719,12 +719,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
     }
 }
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
+void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
     quantize_row_q5_1_reference(x, y, k);
 }
 // reference implementation for deterministic creation of model files
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
+void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
@@ -749,7 +749,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict
     }
 }
-void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
@@ -938,7 +938,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
 }
 // reference implementation for deterministic creation of model files
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
+void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
     assert(QK8_1 == 32);
     assert(k % QK8_1 == 0);
     const int nb = k / QK8_1;
@@ -973,7 +973,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
     }
 }
-void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
     assert(k % QK8_1 == 0);
     const int nb = k / QK8_1;
@@ -1192,7 +1192,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
 #endif
 }
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
+void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK4_0;
     assert(k % qk == 0);
@@ -1212,7 +1212,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
     }
 }
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
+void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK4_1;
     assert(k % qk == 0);
@@ -1233,7 +1233,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
     }
 }
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
+void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK5_0;
     assert(k % qk == 0);
@@ -1259,7 +1259,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int
     }
 }
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
+void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK5_1;
     assert(k % qk == 0);
@@ -1286,7 +1286,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int
     }
 }
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) {
+void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK8_0;
     assert(k % qk == 0);
@@ -1581,7 +1581,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
 //========================- 2-bit (de)-quantization
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
+void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -1658,7 +1658,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     }
 }
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -1704,7 +1704,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
     }
 }
-void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
     quantize_row_q2_K_reference(x, vy, k);
 }
@@ -1960,14 +1960,14 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
     }
 }
-size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
     if (!quant_weights) {
-        quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
+        quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
     }
     else {
         char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
+        for (int64_t row = 0; row < nrow; ++row) {
             quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
             src += n_per_row;
             qrow += row_size;
@@ -1978,7 +1978,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow,
 //========================= 3-bit (de)-quantization
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
+void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -2092,7 +2092,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
 }
 #if QK_K == 256
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -2142,7 +2142,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
     }
 }
 #else
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     assert(QK_K == 64);
     const int nb = k / QK_K;
@@ -2175,11 +2175,11 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
 }
 #endif
-void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
     quantize_row_q3_K_reference(x, vy, k);
 }
-static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
+static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
 #if QK_K != 256
     (void)quant_weights;
     quantize_row_q3_K_reference(x, y, n_per_row);
@@ -2268,14 +2268,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
 #endif
 }
-size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
     if (!quant_weights) {
-        quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
+        quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
     }
     else {
         char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
+        for (int64_t row = 0; row < nrow; ++row) {
             quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
             src += n_per_row;
             qrow += row_size;
@@ -2286,7 +2286,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow,
 // ====================== 4-bit (de)-quantization
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
+void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -2393,7 +2393,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     }
 }
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -2432,19 +2432,19 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
     }
 }
-void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
     assert(k % QK_K == 0);
     block_q4_K * restrict y = vy;
     quantize_row_q4_K_reference(x, y, k);
 }
-static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
+static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
 #if QK_K != 256
     (void)quant_weights;
     quantize_row_q4_K_reference(x, y, n_per_row);
 #else
     assert(n_per_row % QK_K == 0);
-    const int nb = n_per_row / QK_K;
+    const int64_t nb = n_per_row / QK_K;
     uint8_t L[QK_K];
     uint8_t Laux[32];
@@ -2516,14 +2516,14 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
 #endif
 }
-size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
     if (!quant_weights) {
-        quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
+        quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
     }
     else {
         char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
+        for (int64_t row = 0; row < nrow; ++row) {
             quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
             src += n_per_row;
             qrow += row_size;
@@ -2534,9 +2534,9 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow,
 // ====================== 5-bit (de)-quantization
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
+void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
 #if QK_K == 256
     uint8_t L[QK_K];
@@ -2676,9 +2676,9 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     }
 }
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     for (int i = 0; i < nb; i++) {
@@ -2721,19 +2721,19 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
     }
 }
-void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
     assert(k % QK_K == 0);
     block_q5_K * restrict y = vy;
     quantize_row_q5_K_reference(x, y, k);
 }
-static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
+static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
 #if QK_K != 256
     (void)quant_weights;
     quantize_row_q5_K_reference(x, y, n_per_row);
 #else
     assert(n_per_row % QK_K == 0);
-    const int nb = n_per_row / QK_K;
+    const int64_t nb = n_per_row / QK_K;
     uint8_t L[QK_K];
     uint8_t Laux[32];
@@ -2825,14 +2825,14 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
 #endif
 }
-size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
     if (!quant_weights) {
-        quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
+        quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
     }
     else {
         char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
+        for (int64_t row = 0; row < nrow; ++row) {
             quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
             src += n_per_row;
             qrow += row_size;
@@ -2843,9 +2843,9 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow,
 // ====================== 6-bit (de)-quantization
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
+void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     int8_t L[QK_K];
     float   scales[QK_K/16];
@@ -2925,9 +2925,9 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
     }
 }
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     for (int i = 0; i < nb; i++) {
@@ -2972,19 +2972,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
     }
 }
-void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
     assert(k % QK_K == 0);
     block_q6_K * restrict y = vy;
     quantize_row_q6_K_reference(x, y, k);
 }
-static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
+static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
 #if QK_K != 256
     (void)quant_weights;
     quantize_row_q6_K_reference(x, y, n_per_row);
 #else
     assert(n_per_row % QK_K == 0);
-    const int nb = n_per_row / QK_K;
+    const int64_t nb = n_per_row / QK_K;
     int8_t L[QK_K];
     float   scales[QK_K/16];
@@ -3067,14 +3067,14 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
 #endif
 }
-size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
     if (!quant_weights) {
-        quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
+        quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
     }
     else {
         char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
+        for (int64_t row = 0; row < nrow; ++row) {
             quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
             src += n_per_row;
             qrow += row_size;
@@ -3083,7 +3083,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow,
     return nrow * row_size;
 }
-static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
+static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
     static_assert(QK4_0 == 32, "QK4_0 must be 32");
     if (!quant_weights) {
@@ -3098,7 +3098,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
     for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
     float sigma2 = sum_x2/n_per_row;
-    const int nb = n_per_row/QK4_0;
+    const int64_t nb = n_per_row/QK4_0;
     for (int ib = 0; ib < nb; ++ib) {
         const float * xb = x + QK4_0 * ib;
         const float * qw = quant_weights + QK4_0 * ib;
@@ -3111,14 +3111,14 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
     }
 }
-size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     if (!quant_weights) {
-        quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
+        quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
     }
     size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
         src += n_per_row;
         qrow += row_size;
@@ -3126,7 +3126,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow,
     return nrow * row_size;
 }
-static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
+static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
     static_assert(QK4_1 == 32, "QK4_1 must be 32");
     if (!quant_weights) {
@@ -3141,7 +3141,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
     for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
     float sigma2 = sum_x2/n_per_row;
-    const int nb = n_per_row/QK4_1;
+    const int64_t nb = n_per_row/QK4_1;
     for (int ib = 0; ib < nb; ++ib) {
         const float * xb = x + QK4_1 * ib;
         const float * qw = quant_weights + QK4_1 * ib;
@@ -3156,14 +3156,14 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
     }
 }
-size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     if (!quant_weights) {
-        quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
+        quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
     }
     size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
         src += n_per_row;
         qrow += row_size;
@@ -3171,7 +3171,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow,
     return nrow * row_size;
 }
-static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
+static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
     static_assert(QK5_0 == 32, "QK5_0 must be 32");
     if (!quant_weights) {
@@ -3186,7 +3186,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
     for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
     float sigma2 = sum_x2/n_per_row;
-    const int nb = n_per_row/QK5_0;
+    const int64_t nb = n_per_row/QK5_0;
     for (int ib = 0; ib < nb; ++ib) {
         const float * xb = x + QK5_0 * ib;
         const float * qw = quant_weights + QK5_0 * ib;
@@ -3210,14 +3210,14 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
     }
 }
-size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     if (!quant_weights) {
-        quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
+        quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
     }
     size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
         src += n_per_row;
         qrow += row_size;
@@ -3225,7 +3225,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow,
     return nrow * row_size;
 }
-static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
+static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
     static_assert(QK5_1 == 32, "QK5_1 must be 32");
     if (!quant_weights) {
@@ -3240,7 +3240,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
     for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
     float sigma2 = sum_x2/n_per_row;
-    const int nb = n_per_row/QK5_1;
+    const int64_t nb = n_per_row/QK5_1;
     for (int ib = 0; ib < nb; ++ib) {
         const float * xb = x + QK5_1 * ib;
         const float * qw = quant_weights + QK5_1 * ib;
@@ -3263,14 +3263,14 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
     }
 }
-size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     if (!quant_weights) {
-        quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
+        quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
     }
     size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
         src += n_per_row;
         qrow += row_size;
@@ -3278,18 +3278,18 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow,
     return nrow * row_size;
 }
-size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     (void)quant_weights; // not used
     const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
-    quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
+    quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
     return nrow * row_size;
 }
 // ====================== "True" 2-bit (de)-quantization
-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
+void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     uint32_t aux32[2];
     const uint8_t * aux8 = (const uint8_t *)aux32;
@@ -3315,9 +3315,9 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
 // ====================== 2.3125 bpw (de)-quantization
-void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
+void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     float db[2];
@@ -3342,9 +3342,9 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
 // ====================== 2.5625 bpw (de)-quantization
-void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
+void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     float db[2];
@@ -3374,9 +3374,9 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
 // ====================== 3.0625 bpw (de)-quantization
-void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
+void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     uint32_t aux32;
@@ -3406,9 +3406,9 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
 // ====================== 3.3125 bpw (de)-quantization
-void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
+void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     for (int i = 0; i < nb; i++) {
@@ -3449,9 +3449,9 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
 // ====================== 1.5625 bpw (de)-quantization
-void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
+void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     for (int i = 0; i < nb; i++) {
@@ -3474,9 +3474,9 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
     }
 }
-void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
+void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     float delta[4];
     uint16_t idx[4];
@@ -3535,9 +3535,9 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
 static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
+void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
     assert(k % QK4_NL == 0);
-    const int nb = k / QK4_NL;
+    const int64_t nb = k / QK4_NL;
     for (int i = 0; i < nb; i++) {
@@ -3553,12 +3553,12 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
     }
 }
-void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
+void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
 #if QK_K == 64
     dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
 #else
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     for (int i = 0; i < nb; i++) {
@@ -3582,9 +3582,9 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
 //===================================== Q8_K ==============================================
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
+void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     for (int i = 0; i < nb; i++) {
@@ -3621,9 +3621,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
     }
 }
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
+void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    const int nb = k / QK_K;
+    const int64_t nb = k / QK_K;
     for (int i = 0; i < nb; i++) {
         for (int j = 0; j < QK_K; ++j) {
@@ -3632,7 +3632,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int
     }
 }
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
+void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
     quantize_row_q8_K_reference(x, y, k);
 }
@@ -10648,7 +10648,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
     return grid_index;
 }
-static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
     const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
@@ -10664,7 +10664,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
     const int kMaxQ = 3;
-    const int nbl = n/QK_K;
+    const int64_t nbl = n/QK_K;
     block_iq2_xxs * y = vy;
@@ -10821,7 +10821,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
     }
 }
-static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
     const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
@@ -10837,7 +10837,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
     const int kMaxQ = 3;
-    const int nbl = n/QK_K;
+    const int64_t nbl = n/QK_K;
     block_iq2_xs * y = vy;
@@ -11001,11 +11001,11 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
     }
 }
-size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
+    int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
         src += n_per_row;
         qrow += nblock*sizeof(block_iq2_xxs);
@@ -11013,11 +11013,11 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nro
     return nrow * nblock * sizeof(block_iq2_xxs);
 }
-size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
+    int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
         src += n_per_row;
         qrow += nblock*sizeof(block_iq2_xs);
@@ -11242,7 +11242,7 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
     return grid_index;
 }
-static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
+static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
         const float * restrict quant_weights) {
     const int gindex = iq3_data_index(grid_size);
@@ -11259,7 +11259,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
     const int kMaxQ = 8;
-    const int nbl = n/QK_K;
+    const int64_t nbl = n/QK_K;
     ggml_fp16_t * dh;
     uint8_t * qs;
@@ -11455,11 +11455,11 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
     }
 }
-size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
+    int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
         src += n_per_row;
         qrow += nblock*sizeof(block_iq3_xxs);
@@ -11467,13 +11467,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nro
     return nrow * nblock * sizeof(block_iq3_xxs);
 }
-void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
     assert(k % QK_K == 0);
     block_iq3_xxs * restrict y = vy;
     quantize_row_iq3_xxs_reference(x, y, k);
 }
-void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
+void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
 }
@@ -11504,7 +11504,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
     const int kMaxQ = 8;
-    const int nbl = n/QK_K;
+    const int64_t nbl = n/QK_K;
     block_iq3_s * y = vy;
@@ -11661,9 +11661,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
 }
 #define IQ3S_BLOCK_SIZE 32
-size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
+    int64_t nblock = n_per_row/QK_K;
     float scales[QK_K/IQ3S_BLOCK_SIZE];
     float weight[IQ3S_BLOCK_SIZE];
     float xval[IQ3S_BLOCK_SIZE];
@@ -11674,7 +11674,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
     bool   is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
     uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
                 scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
         src += n_per_row;
@@ -11683,13 +11683,13 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow,
     return nrow * nblock * sizeof(block_iq3_s);
 }
-void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
     assert(k % QK_K == 0);
     block_iq3_s * restrict y = vy;
     quantize_row_iq3_s_reference(x, y, k);
 }
-void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
+void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     quantize_iq3_s(x, y, 1, k, NULL);
 }
@@ -11822,7 +11822,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
 #define IQ1S_BLOCK_SIZE 32
 #define IQ1M_BLOCK_SIZE 16
-static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
         float    * scales,
         float    * weight,
         float    * sumx,
@@ -11846,7 +11846,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
     block_iq1_s * y = vy;
-    const int nbl = n/QK_K;
+    const int64_t nbl = n/QK_K;
     const int block_size = IQ1S_BLOCK_SIZE;
@@ -11980,7 +11980,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
     }
 }
-size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     float  scales[QK_K/IQ1S_BLOCK_SIZE];
     float  weight[IQ1S_BLOCK_SIZE];
@@ -11990,9 +11990,9 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
     float  pairs[2*IQ1S_BLOCK_SIZE];
     uint16_t index[IQ1S_BLOCK_SIZE/8];
     int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
-    int nblock = n_per_row/QK_K;
+    int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
         src += n_per_row;
         qrow += nblock*sizeof(block_iq1_s);
@@ -12000,7 +12000,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
     return nrow * nblock * sizeof(block_iq1_s);
 }
-static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
         float    * scales,
         float    * weight,
         float    * pairs,
@@ -12022,7 +12022,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
     block_iq1_m * y = vy;
-    const int nbl = n/QK_K;
+    const int64_t nbl = n/QK_K;
     const int block_size = IQ1M_BLOCK_SIZE;
@@ -12265,7 +12265,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
     }
 }
-size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     float  scales[QK_K/IQ1M_BLOCK_SIZE];
     float  weight[IQ1M_BLOCK_SIZE];
@@ -12273,9 +12273,9 @@ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow,
     float  pairs[2*IQ1M_BLOCK_SIZE];
     uint16_t index[IQ1M_BLOCK_SIZE/8];
     int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
-    int nblock = n_per_row/QK_K;
+    int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
         src += n_per_row;
         qrow += nblock*sizeof(block_iq1_m);
@@ -12407,16 +12407,16 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
     }
 }
-size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK4_NL == 0);
-    int nblock = n_per_row/QK4_NL;
+    int64_t nblock = n_per_row/QK4_NL;
     char * qrow = (char *)dst;
     uint8_t L[QK4_NL];
     float weight[QK4_NL];
     uint16_t unused_h;
     uint8_t * unused_l = NULL;
     float scale;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
@@ -12429,9 +12429,9 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
     return nrow * nblock * sizeof(block_iq4_nl);
 }
-void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) {
     GGML_ASSERT(k%QK4_NL == 0);
-    int nblock = k/QK4_NL;
+    int64_t nblock = k/QK4_NL;
     uint8_t L[QK4_NL];
     float weight[QK4_NL];
     uint16_t unused_h;
@@ -12444,22 +12444,22 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
     }
 }
-void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
+void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
     assert(k % QK4_NL == 0);
     quantize_row_iq4_nl(x, y, k);
 }
-size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
 #if QK_K == 64
     return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
 #else
     GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
+    int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
     uint8_t L[QK_K];
     float weight[32];
     float scales[QK_K/32];
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
@@ -12473,20 +12473,20 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
 #endif
 }
-void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
     assert(k % QK_K == 0);
     block_iq4_xs * restrict y = vy;
     quantize_row_iq4_xs_reference(x, y, k);
 }
-void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
+void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     quantize_iq4_xs(x, y, 1, k, NULL);
 }
 // =============================== 2.5625 bpw
-static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
     const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
@@ -12501,7 +12501,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
     const int kMaxQ = 3;
-    const int nbl = n/QK_K;
+    const int64_t nbl = n/QK_K;
     block_iq2_s * y = vy;
@@ -12654,11 +12654,11 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
     }
 }
-size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
+    int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
+    for (int64_t row = 0; row < nrow; ++row) {
         quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
         src += n_per_row;
         qrow += nblock*sizeof(block_iq2_s);
@@ -12666,12 +12666,12 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow,
     return nrow * nblock * sizeof(block_iq2_s);
 }
-void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
+void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     quantize_iq2_s(x, y, 1, k, NULL);
 }
-void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
+void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
     assert(k % QK_K == 0);
     block_iq2_s * restrict y = vy;
     quantize_row_iq2_s_reference(x, y, k);