llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +16 -0
 - data/examples/chat.rb +2 -4
 - data/ext/llama_cpp/extconf.rb +1 -0
 - data/ext/llama_cpp/llama_cpp.cpp +27 -0
 - data/lib/llama_cpp/version.rb +2 -2
 - data/sig/llama_cpp.rbs +14 -0
 - data/vendor/tmp/llama.cpp/LICENSE +1 -1
 - data/vendor/tmp/llama.cpp/Makefile +81 -20
 - data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
 - data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
 - data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
 - data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
 - data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
 - data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
 - data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
 - data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
 - data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
 - data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
 - data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
 - data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
 - data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
 - data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
 - data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
 - data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
 - data/vendor/tmp/llama.cpp/ggml.c +141 -101
 - data/vendor/tmp/llama.cpp/ggml.h +18 -12
 - data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
 - data/vendor/tmp/llama.cpp/llama.h +145 -29
 - data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
 - data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
 - data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
 - data/vendor/tmp/llama.cpp/unicode.h +2 -0
 - metadata +5 -3
 
| 
         @@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) { 
     | 
|
| 
       132 
132 
     | 
    
         
             
            }
         
     | 
| 
       133 
133 
     | 
    
         | 
| 
       134 
134 
     | 
    
         
             
            static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
         
     | 
| 
       135 
     | 
    
         
            -
            #if __AVXVNNI__
         
     | 
| 
      
 135 
     | 
    
         
            +
            #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
         
     | 
| 
       136 
136 
     | 
    
         
             
                const __m256i zero = _mm256_setzero_si256();
         
     | 
| 
       137 
137 
     | 
    
         
             
                const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
         
     | 
| 
       138 
138 
     | 
    
         
             
                return _mm256_cvtepi32_ps(summed_pairs);
         
     | 
| 
         @@ -544,7 +544,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 
     | 
|
| 
       544 
544 
     | 
    
         
             
            #endif
         
     | 
| 
       545 
545 
     | 
    
         | 
| 
       546 
546 
     | 
    
         
             
            // reference implementation for deterministic creation of model files
         
     | 
| 
       547 
     | 
    
         
            -
            void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     | 
| 
      
 547 
     | 
    
         
            +
            void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
         
     | 
| 
       548 
548 
     | 
    
         
             
                static const int qk = QK4_0;
         
     | 
| 
       549 
549 
     | 
    
         | 
| 
       550 
550 
     | 
    
         
             
                assert(k % qk == 0);
         
     | 
| 
         @@ -581,12 +581,12 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict 
     | 
|
| 
       581 
581 
     | 
    
         
             
                }
         
     | 
| 
       582 
582 
     | 
    
         
             
            }
         
     | 
| 
       583 
583 
     | 
    
         | 
| 
       584 
     | 
    
         
            -
            void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     | 
| 
      
 584 
     | 
    
         
            +
            void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
         
     | 
| 
       585 
585 
     | 
    
         
             
                quantize_row_q4_0_reference(x, y, k);
         
     | 
| 
       586 
586 
     | 
    
         
             
            }
         
     | 
| 
       587 
587 
     | 
    
         | 
| 
       588 
588 
     | 
    
         | 
| 
       589 
     | 
    
         
            -
            void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
     | 
| 
      
 589 
     | 
    
         
            +
            void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
         
     | 
| 
       590 
590 
     | 
    
         
             
                const int qk = QK4_1;
         
     | 
| 
       591 
591 
     | 
    
         | 
| 
       592 
592 
     | 
    
         
             
                assert(k % qk == 0);
         
     | 
| 
         @@ -623,11 +623,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict 
     | 
|
| 
       623 
623 
     | 
    
         
             
                }
         
     | 
| 
       624 
624 
     | 
    
         
             
            }
         
     | 
| 
       625 
625 
     | 
    
         | 
| 
       626 
     | 
    
         
            -
            void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
     | 
| 
      
 626 
     | 
    
         
            +
            void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
         
     | 
| 
       627 
627 
     | 
    
         
             
                quantize_row_q4_1_reference(x, y, k);
         
     | 
| 
       628 
628 
     | 
    
         
             
            }
         
     | 
| 
       629 
629 
     | 
    
         | 
| 
       630 
     | 
    
         
            -
            void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
     | 
| 
      
 630 
     | 
    
         
            +
            void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
         
     | 
| 
       631 
631 
     | 
    
         
             
                static const int qk = QK5_0;
         
     | 
| 
       632 
632 
     | 
    
         | 
| 
       633 
633 
     | 
    
         
             
                assert(k % qk == 0);
         
     | 
| 
         @@ -671,11 +671,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict 
     | 
|
| 
       671 
671 
     | 
    
         
             
                }
         
     | 
| 
       672 
672 
     | 
    
         
             
            }
         
     | 
| 
       673 
673 
     | 
    
         | 
| 
       674 
     | 
    
         
            -
            void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
     | 
| 
      
 674 
     | 
    
         
            +
            void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
         
     | 
| 
       675 
675 
     | 
    
         
             
                quantize_row_q5_0_reference(x, y, k);
         
     | 
| 
       676 
676 
     | 
    
         
             
            }
         
     | 
| 
       677 
677 
     | 
    
         | 
| 
       678 
     | 
    
         
            -
            void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
     | 
| 
      
 678 
     | 
    
         
            +
            void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
         
     | 
| 
       679 
679 
     | 
    
         
             
                const int qk = QK5_1;
         
     | 
| 
       680 
680 
     | 
    
         | 
| 
       681 
681 
     | 
    
         
             
                assert(k % qk == 0);
         
     | 
| 
         @@ -719,12 +719,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict 
     | 
|
| 
       719 
719 
     | 
    
         
             
                }
         
     | 
| 
       720 
720 
     | 
    
         
             
            }
         
     | 
| 
       721 
721 
     | 
    
         | 
| 
       722 
     | 
    
         
            -
            void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
     | 
| 
      
 722 
     | 
    
         
            +
            void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
         
     | 
| 
       723 
723 
     | 
    
         
             
                quantize_row_q5_1_reference(x, y, k);
         
     | 
| 
       724 
724 
     | 
    
         
             
            }
         
     | 
| 
       725 
725 
     | 
    
         | 
| 
       726 
726 
     | 
    
         
             
            // reference implementation for deterministic creation of model files
         
     | 
| 
       727 
     | 
    
         
            -
            void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
     | 
| 
      
 727 
     | 
    
         
            +
            void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
         
     | 
| 
       728 
728 
     | 
    
         
             
                assert(k % QK8_0 == 0);
         
     | 
| 
       729 
729 
     | 
    
         
             
                const int nb = k / QK8_0;
         
     | 
| 
       730 
730 
     | 
    
         | 
| 
         @@ -749,7 +749,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict 
     | 
|
| 
       749 
749 
     | 
    
         
             
                }
         
     | 
| 
       750 
750 
     | 
    
         
             
            }
         
     | 
| 
       751 
751 
     | 
    
         | 
| 
       752 
     | 
    
         
            -
            void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
     | 
| 
      
 752 
     | 
    
         
            +
            void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       753 
753 
     | 
    
         
             
                assert(QK8_0 == 32);
         
     | 
| 
       754 
754 
     | 
    
         
             
                assert(k % QK8_0 == 0);
         
     | 
| 
       755 
755 
     | 
    
         
             
                const int nb = k / QK8_0;
         
     | 
| 
         @@ -938,7 +938,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { 
     | 
|
| 
       938 
938 
     | 
    
         
             
            }
         
     | 
| 
       939 
939 
     | 
    
         | 
| 
       940 
940 
     | 
    
         
             
            // reference implementation for deterministic creation of model files
         
     | 
| 
       941 
     | 
    
         
            -
            void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
     | 
| 
      
 941 
     | 
    
         
            +
            void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
         
     | 
| 
       942 
942 
     | 
    
         
             
                assert(QK8_1 == 32);
         
     | 
| 
       943 
943 
     | 
    
         
             
                assert(k % QK8_1 == 0);
         
     | 
| 
       944 
944 
     | 
    
         
             
                const int nb = k / QK8_1;
         
     | 
| 
         @@ -973,7 +973,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict 
     | 
|
| 
       973 
973 
     | 
    
         
             
                }
         
     | 
| 
       974 
974 
     | 
    
         
             
            }
         
     | 
| 
       975 
975 
     | 
    
         | 
| 
       976 
     | 
    
         
            -
            void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
     | 
| 
      
 976 
     | 
    
         
            +
            void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       977 
977 
     | 
    
         
             
                assert(k % QK8_1 == 0);
         
     | 
| 
       978 
978 
     | 
    
         
             
                const int nb = k / QK8_1;
         
     | 
| 
       979 
979 
     | 
    
         | 
| 
         @@ -1192,7 +1192,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { 
     | 
|
| 
       1192 
1192 
     | 
    
         
             
            #endif
         
     | 
| 
       1193 
1193 
     | 
    
         
             
            }
         
     | 
| 
       1194 
1194 
     | 
    
         | 
| 
       1195 
     | 
    
         
            -
            void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
     | 
| 
      
 1195 
     | 
    
         
            +
            void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       1196 
1196 
     | 
    
         
             
                static const int qk = QK4_0;
         
     | 
| 
       1197 
1197 
     | 
    
         | 
| 
       1198 
1198 
     | 
    
         
             
                assert(k % qk == 0);
         
     | 
| 
         @@ -1212,7 +1212,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int 
     | 
|
| 
       1212 
1212 
     | 
    
         
             
                }
         
     | 
| 
       1213 
1213 
     | 
    
         
             
            }
         
     | 
| 
       1214 
1214 
     | 
    
         | 
| 
       1215 
     | 
    
         
            -
            void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
     | 
| 
      
 1215 
     | 
    
         
            +
            void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       1216 
1216 
     | 
    
         
             
                static const int qk = QK4_1;
         
     | 
| 
       1217 
1217 
     | 
    
         | 
| 
       1218 
1218 
     | 
    
         
             
                assert(k % qk == 0);
         
     | 
| 
         @@ -1233,7 +1233,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int 
     | 
|
| 
       1233 
1233 
     | 
    
         
             
                }
         
     | 
| 
       1234 
1234 
     | 
    
         
             
            }
         
     | 
| 
       1235 
1235 
     | 
    
         | 
| 
       1236 
     | 
    
         
            -
            void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
     | 
| 
      
 1236 
     | 
    
         
            +
            void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       1237 
1237 
     | 
    
         
             
                static const int qk = QK5_0;
         
     | 
| 
       1238 
1238 
     | 
    
         | 
| 
       1239 
1239 
     | 
    
         
             
                assert(k % qk == 0);
         
     | 
| 
         @@ -1259,7 +1259,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int 
     | 
|
| 
       1259 
1259 
     | 
    
         
             
                }
         
     | 
| 
       1260 
1260 
     | 
    
         
             
            }
         
     | 
| 
       1261 
1261 
     | 
    
         | 
| 
       1262 
     | 
    
         
            -
            void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
     | 
| 
      
 1262 
     | 
    
         
            +
            void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       1263 
1263 
     | 
    
         
             
                static const int qk = QK5_1;
         
     | 
| 
       1264 
1264 
     | 
    
         | 
| 
       1265 
1265 
     | 
    
         
             
                assert(k % qk == 0);
         
     | 
| 
         @@ -1286,7 +1286,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int 
     | 
|
| 
       1286 
1286 
     | 
    
         
             
                }
         
     | 
| 
       1287 
1287 
     | 
    
         
             
            }
         
     | 
| 
       1288 
1288 
     | 
    
         | 
| 
       1289 
     | 
    
         
            -
            void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) {
     | 
| 
      
 1289 
     | 
    
         
            +
            void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       1290 
1290 
     | 
    
         
             
                static const int qk = QK8_0;
         
     | 
| 
       1291 
1291 
     | 
    
         | 
| 
       1292 
1292 
     | 
    
         
             
                assert(k % qk == 0);
         
     | 
| 
         @@ -1581,7 +1581,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * 
     | 
|
| 
       1581 
1581 
     | 
    
         | 
| 
       1582 
1582 
     | 
    
         
             
            //========================- 2-bit (de)-quantization
         
     | 
| 
       1583 
1583 
     | 
    
         | 
| 
       1584 
     | 
    
         
            -
            void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
     | 
| 
      
 1584 
     | 
    
         
            +
            void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
         
     | 
| 
       1585 
1585 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       1586 
1586 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       1587 
1587 
     | 
    
         | 
| 
         @@ -1658,7 +1658,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict 
     | 
|
| 
       1658 
1658 
     | 
    
         
             
                }
         
     | 
| 
       1659 
1659 
     | 
    
         
             
            }
         
     | 
| 
       1660 
1660 
     | 
    
         | 
| 
       1661 
     | 
    
         
            -
            void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
     | 
| 
      
 1661 
     | 
    
         
            +
            void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       1662 
1662 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       1663 
1663 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       1664 
1664 
     | 
    
         | 
| 
         @@ -1704,7 +1704,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int 
     | 
|
| 
       1704 
1704 
     | 
    
         
             
                }
         
     | 
| 
       1705 
1705 
     | 
    
         
             
            }
         
     | 
| 
       1706 
1706 
     | 
    
         | 
| 
       1707 
     | 
    
         
            -
            void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
     | 
| 
      
 1707 
     | 
    
         
            +
            void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       1708 
1708 
     | 
    
         
             
                quantize_row_q2_K_reference(x, vy, k);
         
     | 
| 
       1709 
1709 
     | 
    
         
             
            }
         
     | 
| 
       1710 
1710 
     | 
    
         | 
| 
         @@ -1960,14 +1960,14 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri 
     | 
|
| 
       1960 
1960 
     | 
    
         
             
                }
         
     | 
| 
       1961 
1961 
     | 
    
         
             
            }
         
     | 
| 
       1962 
1962 
     | 
    
         | 
| 
       1963 
     | 
    
         
            -
            size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
     | 
| 
      
 1963 
     | 
    
         
            +
            size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       1964 
1964 
     | 
    
         
             
                size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
         
     | 
| 
       1965 
1965 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
       1966 
     | 
    
         
            -
                    quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 1966 
     | 
    
         
            +
                    quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       1967 
1967 
     | 
    
         
             
                }
         
     | 
| 
       1968 
1968 
     | 
    
         
             
                else {
         
     | 
| 
       1969 
1969 
     | 
    
         
             
                    char * qrow = (char *)dst;
         
     | 
| 
       1970 
     | 
    
         
            -
                    for (int row = 0; row < nrow; ++row) {
     | 
| 
      
 1970 
     | 
    
         
            +
                    for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       1971 
1971 
     | 
    
         
             
                        quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
         
     | 
| 
       1972 
1972 
     | 
    
         
             
                        src += n_per_row;
         
     | 
| 
       1973 
1973 
     | 
    
         
             
                        qrow += row_size;
         
     | 
| 
         @@ -1978,7 +1978,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       1978 
1978 
     | 
    
         | 
| 
       1979 
1979 
     | 
    
         
             
            //========================= 3-bit (de)-quantization
         
     | 
| 
       1980 
1980 
     | 
    
         | 
| 
       1981 
     | 
    
         
            -
            void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
     | 
| 
      
 1981 
     | 
    
         
            +
            void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
         
     | 
| 
       1982 
1982 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       1983 
1983 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       1984 
1984 
     | 
    
         | 
| 
         @@ -2092,7 +2092,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict 
     | 
|
| 
       2092 
2092 
     | 
    
         
             
            }
         
     | 
| 
       2093 
2093 
     | 
    
         | 
| 
       2094 
2094 
     | 
    
         
             
            #if QK_K == 256
         
     | 
| 
       2095 
     | 
    
         
            -
            void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
     | 
| 
      
 2095 
     | 
    
         
            +
            void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       2096 
2096 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2097 
2097 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       2098 
2098 
     | 
    
         | 
| 
         @@ -2142,7 +2142,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int 
     | 
|
| 
       2142 
2142 
     | 
    
         
             
                }
         
     | 
| 
       2143 
2143 
     | 
    
         
             
            }
         
     | 
| 
       2144 
2144 
     | 
    
         
             
            #else
         
     | 
| 
       2145 
     | 
    
         
            -
            void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
     | 
| 
      
 2145 
     | 
    
         
            +
            void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       2146 
2146 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2147 
2147 
     | 
    
         
             
                assert(QK_K == 64);
         
     | 
| 
       2148 
2148 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
         @@ -2175,11 +2175,11 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int 
     | 
|
| 
       2175 
2175 
     | 
    
         
             
            }
         
     | 
| 
       2176 
2176 
     | 
    
         
             
            #endif
         
     | 
| 
       2177 
2177 
     | 
    
         | 
| 
       2178 
     | 
    
         
            -
            void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
     | 
| 
      
 2178 
     | 
    
         
            +
            void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       2179 
2179 
     | 
    
         
             
                quantize_row_q3_K_reference(x, vy, k);
         
     | 
| 
       2180 
2180 
     | 
    
         
             
            }
         
     | 
| 
       2181 
2181 
     | 
    
         | 
| 
       2182 
     | 
    
         
            -
            static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
     | 
| 
      
 2182 
     | 
    
         
            +
            static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
         
     | 
| 
       2183 
2183 
     | 
    
         
             
            #if QK_K != 256
         
     | 
| 
       2184 
2184 
     | 
    
         
             
                (void)quant_weights;
         
     | 
| 
       2185 
2185 
     | 
    
         
             
                quantize_row_q3_K_reference(x, y, n_per_row);
         
     | 
| 
         @@ -2268,14 +2268,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri 
     | 
|
| 
       2268 
2268 
     | 
    
         
             
            #endif
         
     | 
| 
       2269 
2269 
     | 
    
         
             
            }
         
     | 
| 
       2270 
2270 
     | 
    
         | 
| 
       2271 
     | 
    
         
            -
            size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
     | 
| 
      
 2271 
     | 
    
         
            +
            size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       2272 
2272 
     | 
    
         
             
                size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
         
     | 
| 
       2273 
2273 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
       2274 
     | 
    
         
            -
                    quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 2274 
     | 
    
         
            +
                    quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       2275 
2275 
     | 
    
         
             
                }
         
     | 
| 
       2276 
2276 
     | 
    
         
             
                else {
         
     | 
| 
       2277 
2277 
     | 
    
         
             
                    char * qrow = (char *)dst;
         
     | 
| 
       2278 
     | 
    
         
            -
                    for (int row = 0; row < nrow; ++row) {
     | 
| 
      
 2278 
     | 
    
         
            +
                    for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       2279 
2279 
     | 
    
         
             
                        quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
         
     | 
| 
       2280 
2280 
     | 
    
         
             
                        src += n_per_row;
         
     | 
| 
       2281 
2281 
     | 
    
         
             
                        qrow += row_size;
         
     | 
| 
         @@ -2286,7 +2286,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       2286 
2286 
     | 
    
         | 
| 
       2287 
2287 
     | 
    
         
             
            // ====================== 4-bit (de)-quantization
         
     | 
| 
       2288 
2288 
     | 
    
         | 
| 
       2289 
     | 
    
         
            -
            void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
     | 
| 
      
 2289 
     | 
    
         
            +
            void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
         
     | 
| 
       2290 
2290 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2291 
2291 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       2292 
2292 
     | 
    
         | 
| 
         @@ -2393,7 +2393,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict 
     | 
|
| 
       2393 
2393 
     | 
    
         
             
                }
         
     | 
| 
       2394 
2394 
     | 
    
         
             
            }
         
     | 
| 
       2395 
2395 
     | 
    
         | 
| 
       2396 
     | 
    
         
            -
            void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
     | 
| 
      
 2396 
     | 
    
         
            +
            void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       2397 
2397 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2398 
2398 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       2399 
2399 
     | 
    
         | 
| 
         @@ -2432,19 +2432,19 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int 
     | 
|
| 
       2432 
2432 
     | 
    
         
             
                }
         
     | 
| 
       2433 
2433 
     | 
    
         
             
            }
         
     | 
| 
       2434 
2434 
     | 
    
         | 
| 
       2435 
     | 
    
         
            -
            void quantize_row_q4_K(const float * restrict x, void * restrict vy,  
     | 
| 
      
 2435 
     | 
    
         
            +
            void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       2436 
2436 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2437 
2437 
     | 
    
         
             
                block_q4_K * restrict y = vy;
         
     | 
| 
       2438 
2438 
     | 
    
         
             
                quantize_row_q4_K_reference(x, y, k);
         
     | 
| 
       2439 
2439 
     | 
    
         
             
            }
         
     | 
| 
       2440 
2440 
     | 
    
         | 
| 
       2441 
     | 
    
         
            -
            static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y,  
     | 
| 
      
 2441 
     | 
    
         
            +
            static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       2442 
2442 
     | 
    
         
             
            #if QK_K != 256
         
     | 
| 
       2443 
2443 
     | 
    
         
             
                (void)quant_weights;
         
     | 
| 
       2444 
2444 
     | 
    
         
             
                quantize_row_q4_K_reference(x, y, n_per_row);
         
     | 
| 
       2445 
2445 
     | 
    
         
             
            #else
         
     | 
| 
       2446 
2446 
     | 
    
         
             
                assert(n_per_row % QK_K == 0);
         
     | 
| 
       2447 
     | 
    
         
            -
                const  
     | 
| 
      
 2447 
     | 
    
         
            +
                const int64_t nb = n_per_row / QK_K;
         
     | 
| 
       2448 
2448 
     | 
    
         | 
| 
       2449 
2449 
     | 
    
         
             
                uint8_t L[QK_K];
         
     | 
| 
       2450 
2450 
     | 
    
         
             
                uint8_t Laux[32];
         
     | 
| 
         @@ -2516,14 +2516,14 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri 
     | 
|
| 
       2516 
2516 
     | 
    
         
             
            #endif
         
     | 
| 
       2517 
2517 
     | 
    
         
             
            }
         
     | 
| 
       2518 
2518 
     | 
    
         | 
| 
       2519 
     | 
    
         
            -
            size_t quantize_q4_K(const float * restrict src, void * restrict dst,  
     | 
| 
      
 2519 
     | 
    
         
            +
            size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       2520 
2520 
     | 
    
         
             
                size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
         
     | 
| 
       2521 
2521 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
       2522 
     | 
    
         
            -
                    quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 2522 
     | 
    
         
            +
                    quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       2523 
2523 
     | 
    
         
             
                }
         
     | 
| 
       2524 
2524 
     | 
    
         
             
                else {
         
     | 
| 
       2525 
2525 
     | 
    
         
             
                    char * qrow = (char *)dst;
         
     | 
| 
       2526 
     | 
    
         
            -
                    for ( 
     | 
| 
      
 2526 
     | 
    
         
            +
                    for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       2527 
2527 
     | 
    
         
             
                        quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
         
     | 
| 
       2528 
2528 
     | 
    
         
             
                        src += n_per_row;
         
     | 
| 
       2529 
2529 
     | 
    
         
             
                        qrow += row_size;
         
     | 
| 
         @@ -2534,9 +2534,9 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       2534 
2534 
     | 
    
         | 
| 
       2535 
2535 
     | 
    
         
             
            // ====================== 5-bit (de)-quantization
         
     | 
| 
       2536 
2536 
     | 
    
         | 
| 
       2537 
     | 
    
         
            -
            void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y,  
     | 
| 
      
 2537 
     | 
    
         
            +
            void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
         
     | 
| 
       2538 
2538 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2539 
     | 
    
         
            -
                const  
     | 
| 
      
 2539 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       2540 
2540 
     | 
    
         | 
| 
       2541 
2541 
     | 
    
         
             
            #if QK_K == 256
         
     | 
| 
       2542 
2542 
     | 
    
         
             
                uint8_t L[QK_K];
         
     | 
| 
         @@ -2676,9 +2676,9 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict 
     | 
|
| 
       2676 
2676 
     | 
    
         
             
                }
         
     | 
| 
       2677 
2677 
     | 
    
         
             
            }
         
     | 
| 
       2678 
2678 
     | 
    
         | 
| 
       2679 
     | 
    
         
            -
            void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y,  
     | 
| 
      
 2679 
     | 
    
         
            +
            void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       2680 
2680 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2681 
     | 
    
         
            -
                const  
     | 
| 
      
 2681 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       2682 
2682 
     | 
    
         | 
| 
       2683 
2683 
     | 
    
         
             
                for (int i = 0; i < nb; i++) {
         
     | 
| 
       2684 
2684 
     | 
    
         | 
| 
         @@ -2721,19 +2721,19 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int 
     | 
|
| 
       2721 
2721 
     | 
    
         
             
                }
         
     | 
| 
       2722 
2722 
     | 
    
         
             
            }
         
     | 
| 
       2723 
2723 
     | 
    
         | 
| 
       2724 
     | 
    
         
            -
            void quantize_row_q5_K(const float * restrict x, void * restrict vy,  
     | 
| 
      
 2724 
     | 
    
         
            +
            void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       2725 
2725 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2726 
2726 
     | 
    
         
             
                block_q5_K * restrict y = vy;
         
     | 
| 
       2727 
2727 
     | 
    
         
             
                quantize_row_q5_K_reference(x, y, k);
         
     | 
| 
       2728 
2728 
     | 
    
         
             
            }
         
     | 
| 
       2729 
2729 
     | 
    
         | 
| 
       2730 
     | 
    
         
            -
            static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y,  
     | 
| 
      
 2730 
     | 
    
         
            +
            static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       2731 
2731 
     | 
    
         
             
            #if QK_K != 256
         
     | 
| 
       2732 
2732 
     | 
    
         
             
                (void)quant_weights;
         
     | 
| 
       2733 
2733 
     | 
    
         
             
                quantize_row_q5_K_reference(x, y, n_per_row);
         
     | 
| 
       2734 
2734 
     | 
    
         
             
            #else
         
     | 
| 
       2735 
2735 
     | 
    
         
             
                assert(n_per_row % QK_K == 0);
         
     | 
| 
       2736 
     | 
    
         
            -
                const  
     | 
| 
      
 2736 
     | 
    
         
            +
                const int64_t nb = n_per_row / QK_K;
         
     | 
| 
       2737 
2737 
     | 
    
         | 
| 
       2738 
2738 
     | 
    
         
             
                uint8_t L[QK_K];
         
     | 
| 
       2739 
2739 
     | 
    
         
             
                uint8_t Laux[32];
         
     | 
| 
         @@ -2825,14 +2825,14 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri 
     | 
|
| 
       2825 
2825 
     | 
    
         
             
            #endif
         
     | 
| 
       2826 
2826 
     | 
    
         
             
            }
         
     | 
| 
       2827 
2827 
     | 
    
         | 
| 
       2828 
     | 
    
         
            -
            size_t quantize_q5_K(const float * restrict src, void * restrict dst,  
     | 
| 
      
 2828 
     | 
    
         
            +
            size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       2829 
2829 
     | 
    
         
             
                size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
         
     | 
| 
       2830 
2830 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
       2831 
     | 
    
         
            -
                    quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 2831 
     | 
    
         
            +
                    quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       2832 
2832 
     | 
    
         
             
                }
         
     | 
| 
       2833 
2833 
     | 
    
         
             
                else {
         
     | 
| 
       2834 
2834 
     | 
    
         
             
                    char * qrow = (char *)dst;
         
     | 
| 
       2835 
     | 
    
         
            -
                    for ( 
     | 
| 
      
 2835 
     | 
    
         
            +
                    for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       2836 
2836 
     | 
    
         
             
                        quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
         
     | 
| 
       2837 
2837 
     | 
    
         
             
                        src += n_per_row;
         
     | 
| 
       2838 
2838 
     | 
    
         
             
                        qrow += row_size;
         
     | 
| 
         @@ -2843,9 +2843,9 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       2843 
2843 
     | 
    
         | 
| 
       2844 
2844 
     | 
    
         
             
            // ====================== 6-bit (de)-quantization
         
     | 
| 
       2845 
2845 
     | 
    
         | 
| 
       2846 
     | 
    
         
            -
            void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y,  
     | 
| 
      
 2846 
     | 
    
         
            +
            void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
         
     | 
| 
       2847 
2847 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2848 
     | 
    
         
            -
                const  
     | 
| 
      
 2848 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       2849 
2849 
     | 
    
         | 
| 
       2850 
2850 
     | 
    
         
             
                int8_t L[QK_K];
         
     | 
| 
       2851 
2851 
     | 
    
         
             
                float   scales[QK_K/16];
         
     | 
| 
         @@ -2925,9 +2925,9 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict 
     | 
|
| 
       2925 
2925 
     | 
    
         
             
                }
         
     | 
| 
       2926 
2926 
     | 
    
         
             
            }
         
     | 
| 
       2927 
2927 
     | 
    
         | 
| 
       2928 
     | 
    
         
            -
            void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y,  
     | 
| 
      
 2928 
     | 
    
         
            +
            void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       2929 
2929 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2930 
     | 
    
         
            -
                const  
     | 
| 
      
 2930 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       2931 
2931 
     | 
    
         | 
| 
       2932 
2932 
     | 
    
         
             
                for (int i = 0; i < nb; i++) {
         
     | 
| 
       2933 
2933 
     | 
    
         | 
| 
         @@ -2972,19 +2972,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int 
     | 
|
| 
       2972 
2972 
     | 
    
         
             
                }
         
     | 
| 
       2973 
2973 
     | 
    
         
             
            }
         
     | 
| 
       2974 
2974 
     | 
    
         | 
| 
       2975 
     | 
    
         
            -
            void quantize_row_q6_K(const float * restrict x, void * restrict vy,  
     | 
| 
      
 2975 
     | 
    
         
            +
            void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       2976 
2976 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       2977 
2977 
     | 
    
         
             
                block_q6_K * restrict y = vy;
         
     | 
| 
       2978 
2978 
     | 
    
         
             
                quantize_row_q6_K_reference(x, y, k);
         
     | 
| 
       2979 
2979 
     | 
    
         
             
            }
         
     | 
| 
       2980 
2980 
     | 
    
         | 
| 
       2981 
     | 
    
         
            -
            static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y,  
     | 
| 
      
 2981 
     | 
    
         
            +
            static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       2982 
2982 
     | 
    
         
             
            #if QK_K != 256
         
     | 
| 
       2983 
2983 
     | 
    
         
             
                (void)quant_weights;
         
     | 
| 
       2984 
2984 
     | 
    
         
             
                quantize_row_q6_K_reference(x, y, n_per_row);
         
     | 
| 
       2985 
2985 
     | 
    
         
             
            #else
         
     | 
| 
       2986 
2986 
     | 
    
         
             
                assert(n_per_row % QK_K == 0);
         
     | 
| 
       2987 
     | 
    
         
            -
                const  
     | 
| 
      
 2987 
     | 
    
         
            +
                const int64_t nb = n_per_row / QK_K;
         
     | 
| 
       2988 
2988 
     | 
    
         | 
| 
       2989 
2989 
     | 
    
         
             
                int8_t L[QK_K];
         
     | 
| 
       2990 
2990 
     | 
    
         
             
                float   scales[QK_K/16];
         
     | 
| 
         @@ -3067,14 +3067,14 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri 
     | 
|
| 
       3067 
3067 
     | 
    
         
             
            #endif
         
     | 
| 
       3068 
3068 
     | 
    
         
             
            }
         
     | 
| 
       3069 
3069 
     | 
    
         | 
| 
       3070 
     | 
    
         
            -
            size_t quantize_q6_K(const float * restrict src, void * restrict dst,  
     | 
| 
      
 3070 
     | 
    
         
            +
            size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3071 
3071 
     | 
    
         
             
                size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
         
     | 
| 
       3072 
3072 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
       3073 
     | 
    
         
            -
                    quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 3073 
     | 
    
         
            +
                    quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       3074 
3074 
     | 
    
         
             
                }
         
     | 
| 
       3075 
3075 
     | 
    
         
             
                else {
         
     | 
| 
       3076 
3076 
     | 
    
         
             
                    char * qrow = (char *)dst;
         
     | 
| 
       3077 
     | 
    
         
            -
                    for ( 
     | 
| 
      
 3077 
     | 
    
         
            +
                    for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       3078 
3078 
     | 
    
         
             
                        quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
         
     | 
| 
       3079 
3079 
     | 
    
         
             
                        src += n_per_row;
         
     | 
| 
       3080 
3080 
     | 
    
         
             
                        qrow += row_size;
         
     | 
| 
         @@ -3083,7 +3083,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       3083 
3083 
     | 
    
         
             
                return nrow * row_size;
         
     | 
| 
       3084 
3084 
     | 
    
         
             
            }
         
     | 
| 
       3085 
3085 
     | 
    
         | 
| 
       3086 
     | 
    
         
            -
            static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y,  
     | 
| 
      
 3086 
     | 
    
         
            +
            static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3087 
3087 
     | 
    
         
             
                static_assert(QK4_0 == 32, "QK4_0 must be 32");
         
     | 
| 
       3088 
3088 
     | 
    
         | 
| 
       3089 
3089 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
         @@ -3098,7 +3098,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri 
     | 
|
| 
       3098 
3098 
     | 
    
         
             
                for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
         
     | 
| 
       3099 
3099 
     | 
    
         
             
                float sigma2 = sum_x2/n_per_row;
         
     | 
| 
       3100 
3100 
     | 
    
         | 
| 
       3101 
     | 
    
         
            -
                const  
     | 
| 
      
 3101 
     | 
    
         
            +
                const int64_t nb = n_per_row/QK4_0;
         
     | 
| 
       3102 
3102 
     | 
    
         
             
                for (int ib = 0; ib < nb; ++ib) {
         
     | 
| 
       3103 
3103 
     | 
    
         
             
                    const float * xb = x + QK4_0 * ib;
         
     | 
| 
       3104 
3104 
     | 
    
         
             
                    const float * qw = quant_weights + QK4_0 * ib;
         
     | 
| 
         @@ -3111,14 +3111,14 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri 
     | 
|
| 
       3111 
3111 
     | 
    
         
             
                }
         
     | 
| 
       3112 
3112 
     | 
    
         
             
            }
         
     | 
| 
       3113 
3113 
     | 
    
         | 
| 
       3114 
     | 
    
         
            -
            size_t quantize_q4_0(const float * restrict src, void * restrict dst,  
     | 
| 
      
 3114 
     | 
    
         
            +
            size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3115 
3115 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
       3116 
     | 
    
         
            -
                    quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 3116 
     | 
    
         
            +
                    quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       3117 
3117 
     | 
    
         
             
                    return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
         
     | 
| 
       3118 
3118 
     | 
    
         
             
                }
         
     | 
| 
       3119 
3119 
     | 
    
         
             
                size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
         
     | 
| 
       3120 
3120 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       3121 
     | 
    
         
            -
                for ( 
     | 
| 
      
 3121 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       3122 
3122 
     | 
    
         
             
                    quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
         
     | 
| 
       3123 
3123 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
       3124 
3124 
     | 
    
         
             
                    qrow += row_size;
         
     | 
| 
         @@ -3126,7 +3126,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       3126 
3126 
     | 
    
         
             
                return nrow * row_size;
         
     | 
| 
       3127 
3127 
     | 
    
         
             
            }
         
     | 
| 
       3128 
3128 
     | 
    
         | 
| 
       3129 
     | 
    
         
            -
            static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y,  
     | 
| 
      
 3129 
     | 
    
         
            +
            static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3130 
3130 
     | 
    
         
             
                static_assert(QK4_1 == 32, "QK4_1 must be 32");
         
     | 
| 
       3131 
3131 
     | 
    
         | 
| 
       3132 
3132 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
         @@ -3141,7 +3141,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri 
     | 
|
| 
       3141 
3141 
     | 
    
         
             
                for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
         
     | 
| 
       3142 
3142 
     | 
    
         
             
                float sigma2 = sum_x2/n_per_row;
         
     | 
| 
       3143 
3143 
     | 
    
         | 
| 
       3144 
     | 
    
         
            -
                const  
     | 
| 
      
 3144 
     | 
    
         
            +
                const int64_t nb = n_per_row/QK4_1;
         
     | 
| 
       3145 
3145 
     | 
    
         
             
                for (int ib = 0; ib < nb; ++ib) {
         
     | 
| 
       3146 
3146 
     | 
    
         
             
                    const float * xb = x + QK4_1 * ib;
         
     | 
| 
       3147 
3147 
     | 
    
         
             
                    const float * qw = quant_weights + QK4_1 * ib;
         
     | 
| 
         @@ -3156,14 +3156,14 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri 
     | 
|
| 
       3156 
3156 
     | 
    
         
             
                }
         
     | 
| 
       3157 
3157 
     | 
    
         
             
            }
         
     | 
| 
       3158 
3158 
     | 
    
         | 
| 
       3159 
     | 
    
         
            -
            size_t quantize_q4_1(const float * restrict src, void * restrict dst,  
     | 
| 
      
 3159 
     | 
    
         
            +
            size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3160 
3160 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
       3161 
     | 
    
         
            -
                    quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 3161 
     | 
    
         
            +
                    quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       3162 
3162 
     | 
    
         
             
                    return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
         
     | 
| 
       3163 
3163 
     | 
    
         
             
                }
         
     | 
| 
       3164 
3164 
     | 
    
         
             
                size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
         
     | 
| 
       3165 
3165 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       3166 
     | 
    
         
            -
                for ( 
     | 
| 
      
 3166 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       3167 
3167 
     | 
    
         
             
                    quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
         
     | 
| 
       3168 
3168 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
       3169 
3169 
     | 
    
         
             
                    qrow += row_size;
         
     | 
| 
         @@ -3171,7 +3171,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       3171 
3171 
     | 
    
         
             
                return nrow * row_size;
         
     | 
| 
       3172 
3172 
     | 
    
         
             
            }
         
     | 
| 
       3173 
3173 
     | 
    
         | 
| 
       3174 
     | 
    
         
            -
            static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y,  
     | 
| 
      
 3174 
     | 
    
         
            +
            static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3175 
3175 
     | 
    
         
             
                static_assert(QK5_0 == 32, "QK5_0 must be 32");
         
     | 
| 
       3176 
3176 
     | 
    
         | 
| 
       3177 
3177 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
         @@ -3186,7 +3186,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri 
     | 
|
| 
       3186 
3186 
     | 
    
         
             
                for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
         
     | 
| 
       3187 
3187 
     | 
    
         
             
                float sigma2 = sum_x2/n_per_row;
         
     | 
| 
       3188 
3188 
     | 
    
         | 
| 
       3189 
     | 
    
         
            -
                const  
     | 
| 
      
 3189 
     | 
    
         
            +
                const int64_t nb = n_per_row/QK5_0;
         
     | 
| 
       3190 
3190 
     | 
    
         
             
                for (int ib = 0; ib < nb; ++ib) {
         
     | 
| 
       3191 
3191 
     | 
    
         
             
                    const float * xb = x + QK5_0 * ib;
         
     | 
| 
       3192 
3192 
     | 
    
         
             
                    const float * qw = quant_weights + QK5_0 * ib;
         
     | 
| 
         @@ -3210,14 +3210,14 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri 
     | 
|
| 
       3210 
3210 
     | 
    
         
             
                }
         
     | 
| 
       3211 
3211 
     | 
    
         
             
            }
         
     | 
| 
       3212 
3212 
     | 
    
         | 
| 
       3213 
     | 
    
         
            -
            size_t quantize_q5_0(const float * restrict src, void * restrict dst,  
     | 
| 
      
 3213 
     | 
    
         
            +
            size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3214 
3214 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
       3215 
     | 
    
         
            -
                    quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 3215 
     | 
    
         
            +
                    quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       3216 
3216 
     | 
    
         
             
                    return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
         
     | 
| 
       3217 
3217 
     | 
    
         
             
                }
         
     | 
| 
       3218 
3218 
     | 
    
         
             
                size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
         
     | 
| 
       3219 
3219 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       3220 
     | 
    
         
            -
                for ( 
     | 
| 
      
 3220 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       3221 
3221 
     | 
    
         
             
                    quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
         
     | 
| 
       3222 
3222 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
       3223 
3223 
     | 
    
         
             
                    qrow += row_size;
         
     | 
| 
         @@ -3225,7 +3225,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       3225 
3225 
     | 
    
         
             
                return nrow * row_size;
         
     | 
| 
       3226 
3226 
     | 
    
         
             
            }
         
     | 
| 
       3227 
3227 
     | 
    
         | 
| 
       3228 
     | 
    
         
            -
            static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y,  
     | 
| 
      
 3228 
     | 
    
         
            +
            static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3229 
3229 
     | 
    
         
             
                static_assert(QK5_1 == 32, "QK5_1 must be 32");
         
     | 
| 
       3230 
3230 
     | 
    
         | 
| 
       3231 
3231 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
         @@ -3240,7 +3240,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri 
     | 
|
| 
       3240 
3240 
     | 
    
         
             
                for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
         
     | 
| 
       3241 
3241 
     | 
    
         
             
                float sigma2 = sum_x2/n_per_row;
         
     | 
| 
       3242 
3242 
     | 
    
         | 
| 
       3243 
     | 
    
         
            -
                const  
     | 
| 
      
 3243 
     | 
    
         
            +
                const int64_t nb = n_per_row/QK5_1;
         
     | 
| 
       3244 
3244 
     | 
    
         
             
                for (int ib = 0; ib < nb; ++ib) {
         
     | 
| 
       3245 
3245 
     | 
    
         
             
                    const float * xb = x + QK5_1 * ib;
         
     | 
| 
       3246 
3246 
     | 
    
         
             
                    const float * qw = quant_weights + QK5_1 * ib;
         
     | 
| 
         @@ -3263,14 +3263,14 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri 
     | 
|
| 
       3263 
3263 
     | 
    
         
             
                }
         
     | 
| 
       3264 
3264 
     | 
    
         
             
            }
         
     | 
| 
       3265 
3265 
     | 
    
         | 
| 
       3266 
     | 
    
         
            -
            size_t quantize_q5_1(const float * restrict src, void * restrict dst,  
     | 
| 
      
 3266 
     | 
    
         
            +
            size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3267 
3267 
     | 
    
         
             
                if (!quant_weights) {
         
     | 
| 
       3268 
     | 
    
         
            -
                    quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 3268 
     | 
    
         
            +
                    quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       3269 
3269 
     | 
    
         
             
                    return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
         
     | 
| 
       3270 
3270 
     | 
    
         
             
                }
         
     | 
| 
       3271 
3271 
     | 
    
         
             
                size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
         
     | 
| 
       3272 
3272 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       3273 
     | 
    
         
            -
                for ( 
     | 
| 
      
 3273 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       3274 
3274 
     | 
    
         
             
                    quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
         
     | 
| 
       3275 
3275 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
       3276 
3276 
     | 
    
         
             
                    qrow += row_size;
         
     | 
| 
         @@ -3278,18 +3278,18 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       3278 
3278 
     | 
    
         
             
                return nrow * row_size;
         
     | 
| 
       3279 
3279 
     | 
    
         
             
            }
         
     | 
| 
       3280 
3280 
     | 
    
         | 
| 
       3281 
     | 
    
         
            -
            size_t quantize_q8_0(const float * restrict src, void * restrict dst,  
     | 
| 
      
 3281 
     | 
    
         
            +
            size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       3282 
3282 
     | 
    
         
             
                (void)quant_weights; // not used
         
     | 
| 
       3283 
3283 
     | 
    
         
             
                const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
         
     | 
| 
       3284 
     | 
    
         
            -
                quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
         
     | 
| 
      
 3284 
     | 
    
         
            +
                quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
         
     | 
| 
       3285 
3285 
     | 
    
         
             
                return nrow * row_size;
         
     | 
| 
       3286 
3286 
     | 
    
         
             
            }
         
     | 
| 
       3287 
3287 
     | 
    
         | 
| 
       3288 
3288 
     | 
    
         
             
            // ====================== "True" 2-bit (de)-quantization
         
     | 
| 
       3289 
3289 
     | 
    
         | 
| 
       3290 
     | 
    
         
            -
            void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y,  
     | 
| 
      
 3290 
     | 
    
         
            +
            void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       3291 
3291 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       3292 
     | 
    
         
            -
                const  
     | 
| 
      
 3292 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       3293 
3293 
     | 
    
         | 
| 
       3294 
3294 
     | 
    
         
             
                uint32_t aux32[2];
         
     | 
| 
       3295 
3295 
     | 
    
         
             
                const uint8_t * aux8 = (const uint8_t *)aux32;
         
     | 
| 
         @@ -3315,9 +3315,9 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y 
     | 
|
| 
       3315 
3315 
     | 
    
         | 
| 
       3316 
3316 
     | 
    
         
             
            // ====================== 2.3125 bpw (de)-quantization
         
     | 
| 
       3317 
3317 
     | 
    
         | 
| 
       3318 
     | 
    
         
            -
            void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,  
     | 
| 
      
 3318 
     | 
    
         
            +
            void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       3319 
3319 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       3320 
     | 
    
         
            -
                const  
     | 
| 
      
 3320 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       3321 
3321 
     | 
    
         | 
| 
       3322 
3322 
     | 
    
         
             
                float db[2];
         
     | 
| 
       3323 
3323 
     | 
    
         | 
| 
         @@ -3342,9 +3342,9 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, 
     | 
|
| 
       3342 
3342 
     | 
    
         | 
| 
       3343 
3343 
     | 
    
         
             
            // ====================== 2.5625 bpw (de)-quantization
         
     | 
| 
       3344 
3344 
     | 
    
         | 
| 
       3345 
     | 
    
         
            -
            void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y,  
     | 
| 
      
 3345 
     | 
    
         
            +
            void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       3346 
3346 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       3347 
     | 
    
         
            -
                const  
     | 
| 
      
 3347 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       3348 
3348 
     | 
    
         | 
| 
       3349 
3349 
     | 
    
         
             
                float db[2];
         
     | 
| 
       3350 
3350 
     | 
    
         | 
| 
         @@ -3374,9 +3374,9 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in 
     | 
|
| 
       3374 
3374 
     | 
    
         | 
| 
       3375 
3375 
     | 
    
         
             
            // ====================== 3.0625 bpw (de)-quantization
         
     | 
| 
       3376 
3376 
     | 
    
         | 
| 
       3377 
     | 
    
         
            -
            void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y,  
     | 
| 
      
 3377 
     | 
    
         
            +
            void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       3378 
3378 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       3379 
     | 
    
         
            -
                const  
     | 
| 
      
 3379 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       3380 
3380 
     | 
    
         | 
| 
       3381 
3381 
     | 
    
         
             
                uint32_t aux32;
         
     | 
| 
       3382 
3382 
     | 
    
         | 
| 
         @@ -3406,9 +3406,9 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y 
     | 
|
| 
       3406 
3406 
     | 
    
         | 
| 
       3407 
3407 
     | 
    
         
             
            // ====================== 3.3125 bpw (de)-quantization
         
     | 
| 
       3408 
3408 
     | 
    
         | 
| 
       3409 
     | 
    
         
            -
            void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y,  
     | 
| 
      
 3409 
     | 
    
         
            +
            void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       3410 
3410 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       3411 
     | 
    
         
            -
                const  
     | 
| 
      
 3411 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       3412 
3412 
     | 
    
         | 
| 
       3413 
3413 
     | 
    
         
             
                for (int i = 0; i < nb; i++) {
         
     | 
| 
       3414 
3414 
     | 
    
         | 
| 
         @@ -3449,9 +3449,9 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in 
     | 
|
| 
       3449 
3449 
     | 
    
         | 
| 
       3450 
3450 
     | 
    
         
             
            // ====================== 1.5625 bpw (de)-quantization
         
     | 
| 
       3451 
3451 
     | 
    
         | 
| 
       3452 
     | 
    
         
            -
            void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y,  
     | 
| 
      
 3452 
     | 
    
         
            +
            void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       3453 
3453 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       3454 
     | 
    
         
            -
                const  
     | 
| 
      
 3454 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       3455 
3455 
     | 
    
         | 
| 
       3456 
3456 
     | 
    
         
             
                for (int i = 0; i < nb; i++) {
         
     | 
| 
       3457 
3457 
     | 
    
         | 
| 
         @@ -3474,11 +3474,70 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in 
     | 
|
| 
       3474 
3474 
     | 
    
         
             
                }
         
     | 
| 
       3475 
3475 
     | 
    
         
             
            }
         
     | 
| 
       3476 
3476 
     | 
    
         | 
| 
      
 3477 
     | 
    
         
            +
            void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
      
 3478 
     | 
    
         
            +
                assert(k % QK_K == 0);
         
     | 
| 
      
 3479 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
      
 3480 
     | 
    
         
            +
             
     | 
| 
      
 3481 
     | 
    
         
            +
                float delta[4];
         
     | 
| 
      
 3482 
     | 
    
         
            +
                uint16_t idx[4];
         
     | 
| 
      
 3483 
     | 
    
         
            +
             
     | 
| 
      
 3484 
     | 
    
         
            +
            #if QK_K != 64
         
     | 
| 
      
 3485 
     | 
    
         
            +
                iq1m_scale_t scale;
         
     | 
| 
      
 3486 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 3487 
     | 
    
         
            +
             
     | 
| 
      
 3488 
     | 
    
         
            +
                for (int i = 0; i < nb; i++) {
         
     | 
| 
      
 3489 
     | 
    
         
            +
             
     | 
| 
      
 3490 
     | 
    
         
            +
                    const uint16_t * sc = (const uint16_t *)x[i].scales;
         
     | 
| 
      
 3491 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 3492 
     | 
    
         
            +
                    const float d = GGML_FP16_TO_FP32(x[i].d);
         
     | 
| 
      
 3493 
     | 
    
         
            +
            #else
         
     | 
| 
      
 3494 
     | 
    
         
            +
                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         
     | 
| 
      
 3495 
     | 
    
         
            +
                    const float d = GGML_FP16_TO_FP32(scale.f16);
         
     | 
| 
      
 3496 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 3497 
     | 
    
         
            +
                    const uint8_t * qs = x[i].qs;
         
     | 
| 
      
 3498 
     | 
    
         
            +
                    const uint8_t * qh = x[i].qh;
         
     | 
| 
      
 3499 
     | 
    
         
            +
             
     | 
| 
      
 3500 
     | 
    
         
            +
                    for (int ib = 0; ib < QK_K/32; ++ib) {
         
     | 
| 
      
 3501 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 3502 
     | 
    
         
            +
                        const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1);
         
     | 
| 
      
 3503 
     | 
    
         
            +
                        const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1);
         
     | 
| 
      
 3504 
     | 
    
         
            +
            #else
         
     | 
| 
      
 3505 
     | 
    
         
            +
                        const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
         
     | 
| 
      
 3506 
     | 
    
         
            +
                        const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
         
     | 
| 
      
 3507 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 3508 
     | 
    
         
            +
                        idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
         
     | 
| 
      
 3509 
     | 
    
         
            +
                        idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
         
     | 
| 
      
 3510 
     | 
    
         
            +
                        idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
         
     | 
| 
      
 3511 
     | 
    
         
            +
                        idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
         
     | 
| 
      
 3512 
     | 
    
         
            +
                        delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
         
     | 
| 
      
 3513 
     | 
    
         
            +
                        delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
         
     | 
| 
      
 3514 
     | 
    
         
            +
                        delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
         
     | 
| 
      
 3515 
     | 
    
         
            +
                        delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
         
     | 
| 
      
 3516 
     | 
    
         
            +
                        for (int l = 0; l < 2; ++l) {
         
     | 
| 
      
 3517 
     | 
    
         
            +
                            const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
         
     | 
| 
      
 3518 
     | 
    
         
            +
                            for (int j = 0; j < 8; ++j) {
         
     | 
| 
      
 3519 
     | 
    
         
            +
                                y[j] = dl1 * (grid[j] + delta[l]);
         
     | 
| 
      
 3520 
     | 
    
         
            +
                            }
         
     | 
| 
      
 3521 
     | 
    
         
            +
                            y += 8;
         
     | 
| 
      
 3522 
     | 
    
         
            +
                        }
         
     | 
| 
      
 3523 
     | 
    
         
            +
                        for (int l = 2; l < 4; ++l) {
         
     | 
| 
      
 3524 
     | 
    
         
            +
                            const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
         
     | 
| 
      
 3525 
     | 
    
         
            +
                            for (int j = 0; j < 8; ++j) {
         
     | 
| 
      
 3526 
     | 
    
         
            +
                                y[j] = dl2 * (grid[j] + delta[l]);
         
     | 
| 
      
 3527 
     | 
    
         
            +
                            }
         
     | 
| 
      
 3528 
     | 
    
         
            +
                            y += 8;
         
     | 
| 
      
 3529 
     | 
    
         
            +
                        }
         
     | 
| 
      
 3530 
     | 
    
         
            +
                        qs += 4;
         
     | 
| 
      
 3531 
     | 
    
         
            +
                        qh += 2;
         
     | 
| 
      
 3532 
     | 
    
         
            +
                    }
         
     | 
| 
      
 3533 
     | 
    
         
            +
                }
         
     | 
| 
      
 3534 
     | 
    
         
            +
            }
         
     | 
| 
      
 3535 
     | 
    
         
            +
             
     | 
| 
       3477 
3536 
     | 
    
         
             
            static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
         
     | 
| 
       3478 
3537 
     | 
    
         | 
| 
       3479 
     | 
    
         
            -
            void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,  
     | 
| 
      
 3538 
     | 
    
         
            +
            void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       3480 
3539 
     | 
    
         
             
                assert(k % QK4_NL == 0);
         
     | 
| 
       3481 
     | 
    
         
            -
                const  
     | 
| 
      
 3540 
     | 
    
         
            +
                const int64_t nb = k / QK4_NL;
         
     | 
| 
       3482 
3541 
     | 
    
         | 
| 
       3483 
3542 
     | 
    
         
             
                for (int i = 0; i < nb; i++) {
         
     | 
| 
       3484 
3543 
     | 
    
         | 
| 
         @@ -3494,12 +3553,12 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, 
     | 
|
| 
       3494 
3553 
     | 
    
         
             
                }
         
     | 
| 
       3495 
3554 
     | 
    
         
             
            }
         
     | 
| 
       3496 
3555 
     | 
    
         | 
| 
       3497 
     | 
    
         
            -
            void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,  
     | 
| 
      
 3556 
     | 
    
         
            +
            void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       3498 
3557 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       3499 
3558 
     | 
    
         
             
            #if QK_K == 64
         
     | 
| 
       3500 
3559 
     | 
    
         
             
                dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
         
     | 
| 
       3501 
3560 
     | 
    
         
             
            #else
         
     | 
| 
       3502 
     | 
    
         
            -
                const  
     | 
| 
      
 3561 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       3503 
3562 
     | 
    
         | 
| 
       3504 
3563 
     | 
    
         
             
                for (int i = 0; i < nb; i++) {
         
     | 
| 
       3505 
3564 
     | 
    
         | 
| 
         @@ -3523,9 +3582,9 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, 
     | 
|
| 
       3523 
3582 
     | 
    
         | 
| 
       3524 
3583 
     | 
    
         
             
            //===================================== Q8_K ==============================================
         
     | 
| 
       3525 
3584 
     | 
    
         | 
| 
       3526 
     | 
    
         
            -
            void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y,  
     | 
| 
      
 3585 
     | 
    
         
            +
            void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
         
     | 
| 
       3527 
3586 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       3528 
     | 
    
         
            -
                const  
     | 
| 
      
 3587 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       3529 
3588 
     | 
    
         | 
| 
       3530 
3589 
     | 
    
         
             
                for (int i = 0; i < nb; i++) {
         
     | 
| 
       3531 
3590 
     | 
    
         | 
| 
         @@ -3562,9 +3621,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict 
     | 
|
| 
       3562 
3621 
     | 
    
         
             
                }
         
     | 
| 
       3563 
3622 
     | 
    
         
             
            }
         
     | 
| 
       3564 
3623 
     | 
    
         | 
| 
       3565 
     | 
    
         
            -
            void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y,  
     | 
| 
      
 3624 
     | 
    
         
            +
            void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
         
     | 
| 
       3566 
3625 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       3567 
     | 
    
         
            -
                const  
     | 
| 
      
 3626 
     | 
    
         
            +
                const int64_t nb = k / QK_K;
         
     | 
| 
       3568 
3627 
     | 
    
         | 
| 
       3569 
3628 
     | 
    
         
             
                for (int i = 0; i < nb; i++) {
         
     | 
| 
       3570 
3629 
     | 
    
         
             
                    for (int j = 0; j < QK_K; ++j) {
         
     | 
| 
         @@ -3573,7 +3632,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int 
     | 
|
| 
       3573 
3632 
     | 
    
         
             
                }
         
     | 
| 
       3574 
3633 
     | 
    
         
             
            }
         
     | 
| 
       3575 
3634 
     | 
    
         | 
| 
       3576 
     | 
    
         
            -
            void quantize_row_q8_K(const float * restrict x, void * restrict y,  
     | 
| 
      
 3635 
     | 
    
         
            +
            void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
         
     | 
| 
       3577 
3636 
     | 
    
         
             
                quantize_row_q8_K_reference(x, y, k);
         
     | 
| 
       3578 
3637 
     | 
    
         
             
            }
         
     | 
| 
       3579 
3638 
     | 
    
         | 
| 
         @@ -9695,6 +9754,248 @@ void ggml_vec_dot_iq1_s_q8_K  (int n, float * restrict s, size_t bs, const void 
     | 
|
| 
       9695 
9754 
     | 
    
         
             
            #endif
         
     | 
| 
       9696 
9755 
     | 
    
         
             
            }
         
     | 
| 
       9697 
9756 
     | 
    
         | 
| 
      
 9757 
     | 
    
         
            +
            void ggml_vec_dot_iq1_m_q8_K  (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         
     | 
| 
      
 9758 
     | 
    
         
            +
                assert(n % QK_K == 0);
         
     | 
| 
      
 9759 
     | 
    
         
            +
                assert(nrc == 1);
         
     | 
| 
      
 9760 
     | 
    
         
            +
                UNUSED(nrc);
         
     | 
| 
      
 9761 
     | 
    
         
            +
                UNUSED(bx);
         
     | 
| 
      
 9762 
     | 
    
         
            +
                UNUSED(by);
         
     | 
| 
      
 9763 
     | 
    
         
            +
                UNUSED(bs);
         
     | 
| 
      
 9764 
     | 
    
         
            +
             
     | 
| 
      
 9765 
     | 
    
         
            +
                const block_iq1_m * restrict x = vx;
         
     | 
| 
      
 9766 
     | 
    
         
            +
                const block_q8_K  * restrict y = vy;
         
     | 
| 
      
 9767 
     | 
    
         
            +
             
     | 
| 
      
 9768 
     | 
    
         
            +
                const int nb = n / QK_K;
         
     | 
| 
      
 9769 
     | 
    
         
            +
             
     | 
| 
      
 9770 
     | 
    
         
            +
            #if QK_K != 64
         
     | 
| 
      
 9771 
     | 
    
         
            +
                iq1m_scale_t scale;
         
     | 
| 
      
 9772 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9773 
     | 
    
         
            +
             
     | 
| 
      
 9774 
     | 
    
         
            +
            #if defined __ARM_NEON
         
     | 
| 
      
 9775 
     | 
    
         
            +
             
     | 
| 
      
 9776 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 9777 
     | 
    
         
            +
                const int32x4_t mask  = vdupq_n_s32(0xf);
         
     | 
| 
      
 9778 
     | 
    
         
            +
            #else
         
     | 
| 
      
 9779 
     | 
    
         
            +
                const int32x4_t mask  = vdupq_n_s32(0x7);
         
     | 
| 
      
 9780 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9781 
     | 
    
         
            +
                const int32x4_t mone  = vdupq_n_s32(1);
         
     | 
| 
      
 9782 
     | 
    
         
            +
                const int32x4_t mzero = vdupq_n_s32(0);
         
     | 
| 
      
 9783 
     | 
    
         
            +
             
     | 
| 
      
 9784 
     | 
    
         
            +
                ggml_int8x16x4_t deltas;
         
     | 
| 
      
 9785 
     | 
    
         
            +
                deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
         
     | 
| 
      
 9786 
     | 
    
         
            +
                deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
         
     | 
| 
      
 9787 
     | 
    
         
            +
                deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
         
     | 
| 
      
 9788 
     | 
    
         
            +
                deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
         
     | 
| 
      
 9789 
     | 
    
         
            +
             
     | 
| 
      
 9790 
     | 
    
         
            +
                ggml_int8x16x4_t q1b;
         
     | 
| 
      
 9791 
     | 
    
         
            +
                ggml_int8x16x4_t q8b;
         
     | 
| 
      
 9792 
     | 
    
         
            +
             
     | 
| 
      
 9793 
     | 
    
         
            +
                uint32_t aux32;
         
     | 
| 
      
 9794 
     | 
    
         
            +
                const uint8_t * aux8 = (const uint8_t *)&aux32;
         
     | 
| 
      
 9795 
     | 
    
         
            +
             
     | 
| 
      
 9796 
     | 
    
         
            +
                float sumf = 0;
         
     | 
| 
      
 9797 
     | 
    
         
            +
                for (int i = 0; i < nb; ++i) {
         
     | 
| 
      
 9798 
     | 
    
         
            +
             
     | 
| 
      
 9799 
     | 
    
         
            +
                    const int8_t   * q8 = y[i].qs;
         
     | 
| 
      
 9800 
     | 
    
         
            +
                    const uint8_t  * qs = x[i].qs;
         
     | 
| 
      
 9801 
     | 
    
         
            +
                    const uint8_t  * qh = x[i].qh;
         
     | 
| 
      
 9802 
     | 
    
         
            +
                    const uint16_t * sc = (const uint16_t *)x[i].scales;
         
     | 
| 
      
 9803 
     | 
    
         
            +
             
     | 
| 
      
 9804 
     | 
    
         
            +
            #if QK_K != 64
         
     | 
| 
      
 9805 
     | 
    
         
            +
                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         
     | 
| 
      
 9806 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9807 
     | 
    
         
            +
             
     | 
| 
      
 9808 
     | 
    
         
            +
                    int32x4_t sumi1 = mzero;
         
     | 
| 
      
 9809 
     | 
    
         
            +
                    int32x4_t sumi2 = mzero;
         
     | 
| 
      
 9810 
     | 
    
         
            +
             
     | 
| 
      
 9811 
     | 
    
         
            +
                    for (int ib = 0; ib < QK_K/32; ib += 2) {
         
     | 
| 
      
 9812 
     | 
    
         
            +
             
     | 
| 
      
 9813 
     | 
    
         
            +
                        q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
         
     | 
| 
      
 9814 
     | 
    
         
            +
                                                 vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
         
     | 
| 
      
 9815 
     | 
    
         
            +
                        q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
         
     | 
| 
      
 9816 
     | 
    
         
            +
                                                 vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
         
     | 
| 
      
 9817 
     | 
    
         
            +
                        q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
         
     | 
| 
      
 9818 
     | 
    
         
            +
                                                 vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
         
     | 
| 
      
 9819 
     | 
    
         
            +
                        q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
         
     | 
| 
      
 9820 
     | 
    
         
            +
                                                 vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
         
     | 
| 
      
 9821 
     | 
    
         
            +
             
     | 
| 
      
 9822 
     | 
    
         
            +
                        q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
         
     | 
| 
      
 9823 
     | 
    
         
            +
             
     | 
| 
      
 9824 
     | 
    
         
            +
                        const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
         
     | 
| 
      
 9825 
     | 
    
         
            +
                        const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
         
     | 
| 
      
 9826 
     | 
    
         
            +
                        const int32x4_t p12 = vpaddq_s32(p1, p2);
         
     | 
| 
      
 9827 
     | 
    
         
            +
             
     | 
| 
      
 9828 
     | 
    
         
            +
                        const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
         
     | 
| 
      
 9829 
     | 
    
         
            +
                        aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
         
     | 
| 
      
 9830 
     | 
    
         
            +
             
     | 
| 
      
 9831 
     | 
    
         
            +
                        const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
         
     | 
| 
      
 9832 
     | 
    
         
            +
                        const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
         
     | 
| 
      
 9833 
     | 
    
         
            +
                        const int32x4_t p34 = vpaddq_s32(p3, p4);
         
     | 
| 
      
 9834 
     | 
    
         
            +
             
     | 
| 
      
 9835 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 9836 
     | 
    
         
            +
                        int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12);
         
     | 
| 
      
 9837 
     | 
    
         
            +
            #else
         
     | 
| 
      
 9838 
     | 
    
         
            +
                        int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
         
     | 
| 
      
 9839 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9840 
     | 
    
         
            +
                        scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
         
     | 
| 
      
 9841 
     | 
    
         
            +
             
     | 
| 
      
 9842 
     | 
    
         
            +
                        sumi1 = vmlaq_s32(sumi1, scales_4, p12);
         
     | 
| 
      
 9843 
     | 
    
         
            +
                        sumi2 = vmlaq_s32(sumi2, scales_4, p34);
         
     | 
| 
      
 9844 
     | 
    
         
            +
             
     | 
| 
      
 9845 
     | 
    
         
            +
                        qs += 8; qh += 4;
         
     | 
| 
      
 9846 
     | 
    
         
            +
             
     | 
| 
      
 9847 
     | 
    
         
            +
                    }
         
     | 
| 
      
 9848 
     | 
    
         
            +
             
     | 
| 
      
 9849 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 9850 
     | 
    
         
            +
                    sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
         
     | 
| 
      
 9851 
     | 
    
         
            +
            #else
         
     | 
| 
      
 9852 
     | 
    
         
            +
                    sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
         
     | 
| 
      
 9853 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9854 
     | 
    
         
            +
                }
         
     | 
| 
      
 9855 
     | 
    
         
            +
             
     | 
| 
      
 9856 
     | 
    
         
            +
                *s = sumf;
         
     | 
| 
      
 9857 
     | 
    
         
            +
             
     | 
| 
      
 9858 
     | 
    
         
            +
            #elif defined __AVX2__
         
     | 
| 
      
 9859 
     | 
    
         
            +
             
     | 
| 
      
 9860 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 9861 
     | 
    
         
            +
                const __m256i mask = _mm256_set1_epi16(0xf);
         
     | 
| 
      
 9862 
     | 
    
         
            +
            #else
         
     | 
| 
      
 9863 
     | 
    
         
            +
                const __m256i mask = _mm256_set1_epi16(0x7);
         
     | 
| 
      
 9864 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9865 
     | 
    
         
            +
                const __m256i mone = _mm256_set1_epi16(1);
         
     | 
| 
      
 9866 
     | 
    
         
            +
             
     | 
| 
      
 9867 
     | 
    
         
            +
                __m256 accum1 = _mm256_setzero_ps();
         
     | 
| 
      
 9868 
     | 
    
         
            +
                __m256 accum2 = _mm256_setzero_ps();
         
     | 
| 
      
 9869 
     | 
    
         
            +
                for (int i = 0; i < nb; ++i) {
         
     | 
| 
      
 9870 
     | 
    
         
            +
             
     | 
| 
      
 9871 
     | 
    
         
            +
                    const int8_t   * q8 = y[i].qs;
         
     | 
| 
      
 9872 
     | 
    
         
            +
                    const uint8_t  * qs = x[i].qs;
         
     | 
| 
      
 9873 
     | 
    
         
            +
                    const uint8_t  * qh = x[i].qh;
         
     | 
| 
      
 9874 
     | 
    
         
            +
                    const uint16_t * sc = (const uint16_t *)x[i].scales;
         
     | 
| 
      
 9875 
     | 
    
         
            +
             
     | 
| 
      
 9876 
     | 
    
         
            +
            #if QK_K != 64
         
     | 
| 
      
 9877 
     | 
    
         
            +
                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         
     | 
| 
      
 9878 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9879 
     | 
    
         
            +
             
     | 
| 
      
 9880 
     | 
    
         
            +
                    __m256i sumi1 = _mm256_setzero_si256();
         
     | 
| 
      
 9881 
     | 
    
         
            +
                    __m256i sumi2 = _mm256_setzero_si256();
         
     | 
| 
      
 9882 
     | 
    
         
            +
                    for (int ib = 0; ib < QK_K/32; ib += 2) {
         
     | 
| 
      
 9883 
     | 
    
         
            +
                        const __m256i q1b_1 = _mm256_set_epi64x(
         
     | 
| 
      
 9884 
     | 
    
         
            +
                                iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
         
     | 
| 
      
 9885 
     | 
    
         
            +
                                iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
         
     | 
| 
      
 9886 
     | 
    
         
            +
                        );
         
     | 
| 
      
 9887 
     | 
    
         
            +
                        const __m256i q1b_2 = _mm256_set_epi64x(
         
     | 
| 
      
 9888 
     | 
    
         
            +
                                iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
         
     | 
| 
      
 9889 
     | 
    
         
            +
                                iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
         
     | 
| 
      
 9890 
     | 
    
         
            +
                        );
         
     | 
| 
      
 9891 
     | 
    
         
            +
                        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
         
     | 
| 
      
 9892 
     | 
    
         
            +
                        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
         
     | 
| 
      
 9893 
     | 
    
         
            +
             
     | 
| 
      
 9894 
     | 
    
         
            +
                        const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
         
     | 
| 
      
 9895 
     | 
    
         
            +
                        const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
         
     | 
| 
      
 9896 
     | 
    
         
            +
             
     | 
| 
      
 9897 
     | 
    
         
            +
                        const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
         
     | 
| 
      
 9898 
     | 
    
         
            +
                                                                 qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
         
     | 
| 
      
 9899 
     | 
    
         
            +
                                                                 qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
         
     | 
| 
      
 9900 
     | 
    
         
            +
                                                                 qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
         
     | 
| 
      
 9901 
     | 
    
         
            +
                        const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
         
     | 
| 
      
 9902 
     | 
    
         
            +
                                                                 qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
         
     | 
| 
      
 9903 
     | 
    
         
            +
                                                                 qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
         
     | 
| 
      
 9904 
     | 
    
         
            +
                                                                 qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
         
     | 
| 
      
 9905 
     | 
    
         
            +
             
     | 
| 
      
 9906 
     | 
    
         
            +
                        const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
         
     | 
| 
      
 9907 
     | 
    
         
            +
                        const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
         
     | 
| 
      
 9908 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 9909 
     | 
    
         
            +
                        __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >>  4), _mm_set1_epi16(sc[0] >> 0));
         
     | 
| 
      
 9910 
     | 
    
         
            +
                        __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8));
         
     | 
| 
      
 9911 
     | 
    
         
            +
            #else
         
     | 
| 
      
 9912 
     | 
    
         
            +
                        __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
         
     | 
| 
      
 9913 
     | 
    
         
            +
                        __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
         
     | 
| 
      
 9914 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9915 
     | 
    
         
            +
                        scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
         
     | 
| 
      
 9916 
     | 
    
         
            +
                        scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
         
     | 
| 
      
 9917 
     | 
    
         
            +
                        const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
         
     | 
| 
      
 9918 
     | 
    
         
            +
                        const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
         
     | 
| 
      
 9919 
     | 
    
         
            +
                        const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
         
     | 
| 
      
 9920 
     | 
    
         
            +
                        const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
         
     | 
| 
      
 9921 
     | 
    
         
            +
             
     | 
| 
      
 9922 
     | 
    
         
            +
                        sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
         
     | 
| 
      
 9923 
     | 
    
         
            +
                        sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
         
     | 
| 
      
 9924 
     | 
    
         
            +
             
     | 
| 
      
 9925 
     | 
    
         
            +
                        qs += 8; qh += 4;
         
     | 
| 
      
 9926 
     | 
    
         
            +
                    }
         
     | 
| 
      
 9927 
     | 
    
         
            +
             
     | 
| 
      
 9928 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 9929 
     | 
    
         
            +
                    const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
         
     | 
| 
      
 9930 
     | 
    
         
            +
            #else
         
     | 
| 
      
 9931 
     | 
    
         
            +
                    const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
         
     | 
| 
      
 9932 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9933 
     | 
    
         
            +
                    accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
         
     | 
| 
      
 9934 
     | 
    
         
            +
                    accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
         
     | 
| 
      
 9935 
     | 
    
         
            +
             
     | 
| 
      
 9936 
     | 
    
         
            +
                }
         
     | 
| 
      
 9937 
     | 
    
         
            +
             
     | 
| 
      
 9938 
     | 
    
         
            +
                *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
         
     | 
| 
      
 9939 
     | 
    
         
            +
             
     | 
| 
      
 9940 
     | 
    
         
            +
            #else
         
     | 
| 
      
 9941 
     | 
    
         
            +
             
     | 
| 
      
 9942 
     | 
    
         
            +
                int sum1[2], sum2[2], delta[4];
         
     | 
| 
      
 9943 
     | 
    
         
            +
             
     | 
| 
      
 9944 
     | 
    
         
            +
                float sumf = 0;
         
     | 
| 
      
 9945 
     | 
    
         
            +
                for (int i = 0; i < nb; i++) {
         
     | 
| 
      
 9946 
     | 
    
         
            +
             
     | 
| 
      
 9947 
     | 
    
         
            +
                    const int8_t   * q8 = y[i].qs;
         
     | 
| 
      
 9948 
     | 
    
         
            +
                    const uint8_t  * qs = x[i].qs;
         
     | 
| 
      
 9949 
     | 
    
         
            +
                    const uint8_t  * qh = x[i].qh;
         
     | 
| 
      
 9950 
     | 
    
         
            +
                    const uint16_t * sc = (const uint16_t *)x[i].scales;
         
     | 
| 
      
 9951 
     | 
    
         
            +
             
     | 
| 
      
 9952 
     | 
    
         
            +
            #if QK_K != 64
         
     | 
| 
      
 9953 
     | 
    
         
            +
                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         
     | 
| 
      
 9954 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9955 
     | 
    
         
            +
             
     | 
| 
      
 9956 
     | 
    
         
            +
                    int sumi1 = 0, sumi2 = 0;
         
     | 
| 
      
 9957 
     | 
    
         
            +
                    for (int ib = 0; ib < QK_K/32; ++ib) {
         
     | 
| 
      
 9958 
     | 
    
         
            +
                        delta[0] = qh[0] & 0x08 ? -1 : 1;
         
     | 
| 
      
 9959 
     | 
    
         
            +
                        delta[1] = qh[0] & 0x80 ? -1 : 1;
         
     | 
| 
      
 9960 
     | 
    
         
            +
                        delta[2] = qh[1] & 0x08 ? -1 : 1;
         
     | 
| 
      
 9961 
     | 
    
         
            +
                        delta[3] = qh[1] & 0x80 ? -1 : 1;
         
     | 
| 
      
 9962 
     | 
    
         
            +
                        sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
         
     | 
| 
      
 9963 
     | 
    
         
            +
                        for (int l = 0; l < 4; ++l) {
         
     | 
| 
      
 9964 
     | 
    
         
            +
                            const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
         
     | 
| 
      
 9965 
     | 
    
         
            +
                            int lsum1 = 0, lsum2 = 0;
         
     | 
| 
      
 9966 
     | 
    
         
            +
                            for (int j = 0; j < 8; ++j) {
         
     | 
| 
      
 9967 
     | 
    
         
            +
                                lsum1 += q8[j] * grid[j];
         
     | 
| 
      
 9968 
     | 
    
         
            +
                                lsum2 += q8[j];
         
     | 
| 
      
 9969 
     | 
    
         
            +
                            }
         
     | 
| 
      
 9970 
     | 
    
         
            +
                            q8 += 8;
         
     | 
| 
      
 9971 
     | 
    
         
            +
                            sum1[l/2] += lsum1;
         
     | 
| 
      
 9972 
     | 
    
         
            +
                            sum2[l/2] += lsum2*delta[l];
         
     | 
| 
      
 9973 
     | 
    
         
            +
                        }
         
     | 
| 
      
 9974 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 9975 
     | 
    
         
            +
                        const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1;
         
     | 
| 
      
 9976 
     | 
    
         
            +
                        const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1;
         
     | 
| 
      
 9977 
     | 
    
         
            +
            #else
         
     | 
| 
      
 9978 
     | 
    
         
            +
                        const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
         
     | 
| 
      
 9979 
     | 
    
         
            +
                        const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
         
     | 
| 
      
 9980 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9981 
     | 
    
         
            +
                        sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
         
     | 
| 
      
 9982 
     | 
    
         
            +
                        sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
         
     | 
| 
      
 9983 
     | 
    
         
            +
                        qs += 4;
         
     | 
| 
      
 9984 
     | 
    
         
            +
                        qh += 2;
         
     | 
| 
      
 9985 
     | 
    
         
            +
                    }
         
     | 
| 
      
 9986 
     | 
    
         
            +
             
     | 
| 
      
 9987 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 9988 
     | 
    
         
            +
                    sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
         
     | 
| 
      
 9989 
     | 
    
         
            +
            #else
         
     | 
| 
      
 9990 
     | 
    
         
            +
                    sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
         
     | 
| 
      
 9991 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9992 
     | 
    
         
            +
                }
         
     | 
| 
      
 9993 
     | 
    
         
            +
             
     | 
| 
      
 9994 
     | 
    
         
            +
                *s = sumf;
         
     | 
| 
      
 9995 
     | 
    
         
            +
             
     | 
| 
      
 9996 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 9997 
     | 
    
         
            +
            }
         
     | 
| 
      
 9998 
     | 
    
         
            +
             
     | 
| 
       9698 
9999 
     | 
    
         
             
            void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         
     | 
| 
       9699 
10000 
     | 
    
         
             
                assert(nrc == 1);
         
     | 
| 
       9700 
10001 
     | 
    
         
             
                UNUSED(nrc);
         
     | 
| 
         @@ -9938,17 +10239,17 @@ static iq2_entry_t iq2_data[4] = { 
     | 
|
| 
       9938 
10239 
     | 
    
         
             
            };
         
     | 
| 
       9939 
10240 
     | 
    
         | 
| 
       9940 
10241 
     | 
    
         
             
            static inline int iq2_data_index(enum ggml_type type) {
         
     | 
| 
       9941 
     | 
    
         
            -
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
         
     | 
| 
      
 10242 
     | 
    
         
            +
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
         
     | 
| 
       9942 
10243 
     | 
    
         
             
                return type == GGML_TYPE_IQ2_XXS ? 0 :
         
     | 
| 
       9943 
10244 
     | 
    
         
             
                       type == GGML_TYPE_IQ2_XS  ? 1 :
         
     | 
| 
       9944 
     | 
    
         
            -
                       type == GGML_TYPE_IQ1_S 
     | 
| 
      
 10245 
     | 
    
         
            +
                       type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
         
     | 
| 
       9945 
10246 
     | 
    
         
             
            }
         
     | 
| 
       9946 
10247 
     | 
    
         | 
| 
       9947 
10248 
     | 
    
         
             
            static inline int iq2_grid_size(enum ggml_type type) {
         
     | 
| 
       9948 
     | 
    
         
            -
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
         
     | 
| 
      
 10249 
     | 
    
         
            +
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
         
     | 
| 
       9949 
10250 
     | 
    
         
             
                return type == GGML_TYPE_IQ2_XXS ? 256 :
         
     | 
| 
       9950 
10251 
     | 
    
         
             
                       type == GGML_TYPE_IQ2_XS  ? 512 :
         
     | 
| 
       9951 
     | 
    
         
            -
                       type == GGML_TYPE_IQ1_S 
     | 
| 
      
 10252 
     | 
    
         
            +
                       type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
         
     | 
| 
       9952 
10253 
     | 
    
         
             
            }
         
     | 
| 
       9953 
10254 
     | 
    
         | 
| 
       9954 
10255 
     | 
    
         
             
            static int iq2_compare_func(const void * left, const void * right) {
         
     | 
| 
         @@ -10214,10 +10515,10 @@ void iq2xs_init_impl(enum ggml_type type) { 
     | 
|
| 
       10214 
10515 
     | 
    
         | 
| 
       10215 
10516 
     | 
    
         
             
                const int kmap_size = 43692;
         
     | 
| 
       10216 
10517 
     | 
    
         
             
                //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
         
     | 
| 
       10217 
     | 
    
         
            -
                const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
         
     | 
| 
      
 10518 
     | 
    
         
            +
                const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
         
     | 
| 
       10218 
10519 
     | 
    
         
             
                const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
         
     | 
| 
       10219 
10520 
     | 
    
         
             
                                         type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 :
         
     | 
| 
       10220 
     | 
    
         
            -
                                         type == GGML_TYPE_IQ1_S 
     | 
| 
      
 10521 
     | 
    
         
            +
                                         type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
         
     | 
| 
       10221 
10522 
     | 
    
         
             
                uint64_t * kgrid_q2xs;
         
     | 
| 
       10222 
10523 
     | 
    
         
             
                int      * kmap_q2xs;
         
     | 
| 
       10223 
10524 
     | 
    
         
             
                uint16_t * kneighbors_q2xs;
         
     | 
| 
         @@ -10314,7 +10615,7 @@ void iq2xs_init_impl(enum ggml_type type) { 
     | 
|
| 
       10314 
10615 
     | 
    
         
             
            }
         
     | 
| 
       10315 
10616 
     | 
    
         | 
| 
       10316 
10617 
     | 
    
         
             
            void iq2xs_free_impl(enum ggml_type type) {
         
     | 
| 
       10317 
     | 
    
         
            -
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
         
     | 
| 
      
 10618 
     | 
    
         
            +
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
         
     | 
| 
       10318 
10619 
     | 
    
         
             
                const int gindex = iq2_data_index(type);
         
     | 
| 
       10319 
10620 
     | 
    
         
             
                if (iq2_data[gindex].grid) {
         
     | 
| 
       10320 
10621 
     | 
    
         
             
                    free(iq2_data[gindex].grid);       iq2_data[gindex].grid = NULL;
         
     | 
| 
         @@ -10347,7 +10648,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u 
     | 
|
| 
       10347 
10648 
     | 
    
         
             
                return grid_index;
         
     | 
| 
       10348 
10649 
     | 
    
         
             
            }
         
     | 
| 
       10349 
10650 
     | 
    
         | 
| 
       10350 
     | 
    
         
            -
            static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy,  
     | 
| 
      
 10651 
     | 
    
         
            +
            static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
         
     | 
| 
       10351 
10652 
     | 
    
         | 
| 
       10352 
10653 
     | 
    
         
             
                const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
         
     | 
| 
       10353 
10654 
     | 
    
         | 
| 
         @@ -10363,7 +10664,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict 
     | 
|
| 
       10363 
10664 
     | 
    
         | 
| 
       10364 
10665 
     | 
    
         
             
                const int kMaxQ = 3;
         
     | 
| 
       10365 
10666 
     | 
    
         | 
| 
       10366 
     | 
    
         
            -
                const  
     | 
| 
      
 10667 
     | 
    
         
            +
                const int64_t nbl = n/QK_K;
         
     | 
| 
       10367 
10668 
     | 
    
         | 
| 
       10368 
10669 
     | 
    
         
             
                block_iq2_xxs * y = vy;
         
     | 
| 
       10369 
10670 
     | 
    
         | 
| 
         @@ -10520,7 +10821,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict 
     | 
|
| 
       10520 
10821 
     | 
    
         
             
                }
         
     | 
| 
       10521 
10822 
     | 
    
         
             
            }
         
     | 
| 
       10522 
10823 
     | 
    
         | 
| 
       10523 
     | 
    
         
            -
            static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy,  
     | 
| 
      
 10824 
     | 
    
         
            +
            static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
         
     | 
| 
       10524 
10825 
     | 
    
         | 
| 
       10525 
10826 
     | 
    
         
             
                const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
         
     | 
| 
       10526 
10827 
     | 
    
         | 
| 
         @@ -10536,7 +10837,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v 
     | 
|
| 
       10536 
10837 
     | 
    
         | 
| 
       10537 
10838 
     | 
    
         
             
                const int kMaxQ = 3;
         
     | 
| 
       10538 
10839 
     | 
    
         | 
| 
       10539 
     | 
    
         
            -
                const  
     | 
| 
      
 10840 
     | 
    
         
            +
                const int64_t nbl = n/QK_K;
         
     | 
| 
       10540 
10841 
     | 
    
         | 
| 
       10541 
10842 
     | 
    
         
             
                block_iq2_xs * y = vy;
         
     | 
| 
       10542 
10843 
     | 
    
         | 
| 
         @@ -10700,11 +11001,11 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v 
     | 
|
| 
       10700 
11001 
     | 
    
         
             
                }
         
     | 
| 
       10701 
11002 
     | 
    
         
             
            }
         
     | 
| 
       10702 
11003 
     | 
    
         | 
| 
       10703 
     | 
    
         
            -
            size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst,  
     | 
| 
      
 11004 
     | 
    
         
            +
            size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       10704 
11005 
     | 
    
         
             
                GGML_ASSERT(n_per_row%QK_K == 0);
         
     | 
| 
       10705 
     | 
    
         
            -
                 
     | 
| 
      
 11006 
     | 
    
         
            +
                int64_t nblock = n_per_row/QK_K;
         
     | 
| 
       10706 
11007 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       10707 
     | 
    
         
            -
                for ( 
     | 
| 
      
 11008 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       10708 
11009 
     | 
    
         
             
                    quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
         
     | 
| 
       10709 
11010 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
       10710 
11011 
     | 
    
         
             
                    qrow += nblock*sizeof(block_iq2_xxs);
         
     | 
| 
         @@ -10712,11 +11013,11 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nro 
     | 
|
| 
       10712 
11013 
     | 
    
         
             
                return nrow * nblock * sizeof(block_iq2_xxs);
         
     | 
| 
       10713 
11014 
     | 
    
         
             
            }
         
     | 
| 
       10714 
11015 
     | 
    
         | 
| 
       10715 
     | 
    
         
            -
            size_t quantize_iq2_xs(const float * restrict src, void * restrict dst,  
     | 
| 
      
 11016 
     | 
    
         
            +
            size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       10716 
11017 
     | 
    
         
             
                GGML_ASSERT(n_per_row%QK_K == 0);
         
     | 
| 
       10717 
     | 
    
         
            -
                 
     | 
| 
      
 11018 
     | 
    
         
            +
                int64_t nblock = n_per_row/QK_K;
         
     | 
| 
       10718 
11019 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       10719 
     | 
    
         
            -
                for ( 
     | 
| 
      
 11020 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       10720 
11021 
     | 
    
         
             
                    quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
         
     | 
| 
       10721 
11022 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
       10722 
11023 
     | 
    
         
             
                    qrow += nblock*sizeof(block_iq2_xs);
         
     | 
| 
         @@ -10941,7 +11242,7 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u 
     | 
|
| 
       10941 
11242 
     | 
    
         
             
                return grid_index;
         
     | 
| 
       10942 
11243 
     | 
    
         
             
            }
         
     | 
| 
       10943 
11244 
     | 
    
         | 
| 
       10944 
     | 
    
         
            -
            static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy,  
     | 
| 
      
 11245 
     | 
    
         
            +
            static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
         
     | 
| 
       10945 
11246 
     | 
    
         
             
                    const float * restrict quant_weights) {
         
     | 
| 
       10946 
11247 
     | 
    
         | 
| 
       10947 
11248 
     | 
    
         
             
                const int gindex = iq3_data_index(grid_size);
         
     | 
| 
         @@ -10958,7 +11259,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v 
     | 
|
| 
       10958 
11259 
     | 
    
         | 
| 
       10959 
11260 
     | 
    
         
             
                const int kMaxQ = 8;
         
     | 
| 
       10960 
11261 
     | 
    
         | 
| 
       10961 
     | 
    
         
            -
                const  
     | 
| 
      
 11262 
     | 
    
         
            +
                const int64_t nbl = n/QK_K;
         
     | 
| 
       10962 
11263 
     | 
    
         | 
| 
       10963 
11264 
     | 
    
         
             
                ggml_fp16_t * dh;
         
     | 
| 
       10964 
11265 
     | 
    
         
             
                uint8_t * qs;
         
     | 
| 
         @@ -11154,11 +11455,11 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v 
     | 
|
| 
       11154 
11455 
     | 
    
         
             
                }
         
     | 
| 
       11155 
11456 
     | 
    
         
             
            }
         
     | 
| 
       11156 
11457 
     | 
    
         | 
| 
       11157 
     | 
    
         
            -
            size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst,  
     | 
| 
      
 11458 
     | 
    
         
            +
            size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       11158 
11459 
     | 
    
         
             
                GGML_ASSERT(n_per_row%QK_K == 0);
         
     | 
| 
       11159 
     | 
    
         
            -
                 
     | 
| 
      
 11460 
     | 
    
         
            +
                int64_t nblock = n_per_row/QK_K;
         
     | 
| 
       11160 
11461 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       11161 
     | 
    
         
            -
                for ( 
     | 
| 
      
 11462 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       11162 
11463 
     | 
    
         
             
                    quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
         
     | 
| 
       11163 
11464 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
       11164 
11465 
     | 
    
         
             
                    qrow += nblock*sizeof(block_iq3_xxs);
         
     | 
| 
         @@ -11166,13 +11467,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nro 
     | 
|
| 
       11166 
11467 
     | 
    
         
             
                return nrow * nblock * sizeof(block_iq3_xxs);
         
     | 
| 
       11167 
11468 
     | 
    
         
             
            }
         
     | 
| 
       11168 
11469 
     | 
    
         | 
| 
       11169 
     | 
    
         
            -
            void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy,  
     | 
| 
      
 11470 
     | 
    
         
            +
            void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       11170 
11471 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       11171 
11472 
     | 
    
         
             
                block_iq3_xxs * restrict y = vy;
         
     | 
| 
       11172 
11473 
     | 
    
         
             
                quantize_row_iq3_xxs_reference(x, y, k);
         
     | 
| 
       11173 
11474 
     | 
    
         
             
            }
         
     | 
| 
       11174 
11475 
     | 
    
         | 
| 
       11175 
     | 
    
         
            -
            void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y,  
     | 
| 
      
 11476 
     | 
    
         
            +
            void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
         
     | 
| 
       11176 
11477 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       11177 
11478 
     | 
    
         
             
                quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
         
     | 
| 
       11178 
11479 
     | 
    
         
             
            }
         
     | 
| 
         @@ -11203,7 +11504,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo 
     | 
|
| 
       11203 
11504 
     | 
    
         | 
| 
       11204 
11505 
     | 
    
         
             
                const int kMaxQ = 8;
         
     | 
| 
       11205 
11506 
     | 
    
         | 
| 
       11206 
     | 
    
         
            -
                const  
     | 
| 
      
 11507 
     | 
    
         
            +
                const int64_t nbl = n/QK_K;
         
     | 
| 
       11207 
11508 
     | 
    
         | 
| 
       11208 
11509 
     | 
    
         
             
                block_iq3_s * y = vy;
         
     | 
| 
       11209 
11510 
     | 
    
         | 
| 
         @@ -11360,9 +11661,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo 
     | 
|
| 
       11360 
11661 
     | 
    
         
             
            }
         
     | 
| 
       11361 
11662 
     | 
    
         | 
| 
       11362 
11663 
     | 
    
         
             
            #define IQ3S_BLOCK_SIZE 32
         
     | 
| 
       11363 
     | 
    
         
            -
            size_t quantize_iq3_s(const float * restrict src, void * restrict dst,  
     | 
| 
      
 11664 
     | 
    
         
            +
            size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       11364 
11665 
     | 
    
         
             
                GGML_ASSERT(n_per_row%QK_K == 0);
         
     | 
| 
       11365 
     | 
    
         
            -
                 
     | 
| 
      
 11666 
     | 
    
         
            +
                int64_t nblock = n_per_row/QK_K;
         
     | 
| 
       11366 
11667 
     | 
    
         
             
                float scales[QK_K/IQ3S_BLOCK_SIZE];
         
     | 
| 
       11367 
11668 
     | 
    
         
             
                float weight[IQ3S_BLOCK_SIZE];
         
     | 
| 
       11368 
11669 
     | 
    
         
             
                float xval[IQ3S_BLOCK_SIZE];
         
     | 
| 
         @@ -11373,7 +11674,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       11373 
11674 
     | 
    
         
             
                bool   is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
         
     | 
| 
       11374 
11675 
     | 
    
         
             
                uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
         
     | 
| 
       11375 
11676 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       11376 
     | 
    
         
            -
                for ( 
     | 
| 
      
 11677 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       11377 
11678 
     | 
    
         
             
                    quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
         
     | 
| 
       11378 
11679 
     | 
    
         
             
                            scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
         
     | 
| 
       11379 
11680 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
         @@ -11382,13 +11683,13 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       11382 
11683 
     | 
    
         
             
                return nrow * nblock * sizeof(block_iq3_s);
         
     | 
| 
       11383 
11684 
     | 
    
         
             
            }
         
     | 
| 
       11384 
11685 
     | 
    
         | 
| 
       11385 
     | 
    
         
            -
            void quantize_row_iq3_s(const float * restrict x, void * restrict vy,  
     | 
| 
      
 11686 
     | 
    
         
            +
            void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       11386 
11687 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       11387 
11688 
     | 
    
         
             
                block_iq3_s * restrict y = vy;
         
     | 
| 
       11388 
11689 
     | 
    
         
             
                quantize_row_iq3_s_reference(x, y, k);
         
     | 
| 
       11389 
11690 
     | 
    
         
             
            }
         
     | 
| 
       11390 
11691 
     | 
    
         | 
| 
       11391 
     | 
    
         
            -
            void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y,  
     | 
| 
      
 11692 
     | 
    
         
            +
            void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
         
     | 
| 
       11392 
11693 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       11393 
11694 
     | 
    
         
             
                quantize_iq3_s(x, y, 1, k, NULL);
         
     | 
| 
       11394 
11695 
     | 
    
         
             
            }
         
     | 
| 
         @@ -11520,7 +11821,16 @@ static int iq1_sort_helper(const void * left, const void * right) { 
     | 
|
| 
       11520 
11821 
     | 
    
         
             
            }
         
     | 
| 
       11521 
11822 
     | 
    
         | 
| 
       11522 
11823 
     | 
    
         
             
            #define IQ1S_BLOCK_SIZE 32
         
     | 
| 
       11523 
     | 
    
         
            -
             
     | 
| 
      
 11824 
     | 
    
         
            +
            #define IQ1M_BLOCK_SIZE 16
         
     | 
| 
      
 11825 
     | 
    
         
            +
            static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
         
     | 
| 
      
 11826 
     | 
    
         
            +
                    float    * scales,
         
     | 
| 
      
 11827 
     | 
    
         
            +
                    float    * weight,
         
     | 
| 
      
 11828 
     | 
    
         
            +
                    float    * sumx,
         
     | 
| 
      
 11829 
     | 
    
         
            +
                    float    * sumw,
         
     | 
| 
      
 11830 
     | 
    
         
            +
                    float    * pairs,
         
     | 
| 
      
 11831 
     | 
    
         
            +
                    int8_t   * L,
         
     | 
| 
      
 11832 
     | 
    
         
            +
                    uint16_t * index,
         
     | 
| 
      
 11833 
     | 
    
         
            +
                    int8_t   * shifts) {
         
     | 
| 
       11524 
11834 
     | 
    
         | 
| 
       11525 
11835 
     | 
    
         
             
                const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
         
     | 
| 
       11526 
11836 
     | 
    
         | 
| 
         @@ -11534,22 +11844,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11534 
11844 
     | 
    
         
             
                GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
         
     | 
| 
       11535 
11845 
     | 
    
         
             
                GGML_ASSERT(n%QK_K == 0);
         
     | 
| 
       11536 
11846 
     | 
    
         | 
| 
       11537 
     | 
    
         
            -
                const int nbl = n/QK_K;
         
     | 
| 
       11538 
     | 
    
         
            -
             
     | 
| 
       11539 
11847 
     | 
    
         
             
                block_iq1_s * y = vy;
         
     | 
| 
       11540 
11848 
     | 
    
         | 
| 
      
 11849 
     | 
    
         
            +
                const int64_t nbl = n/QK_K;
         
     | 
| 
      
 11850 
     | 
    
         
            +
             
     | 
| 
      
 11851 
     | 
    
         
            +
                const int block_size = IQ1S_BLOCK_SIZE;
         
     | 
| 
      
 11852 
     | 
    
         
            +
             
     | 
| 
       11541 
11853 
     | 
    
         
             
                const float x_p[3] = {-1 + IQ1S_DELTA,  IQ1S_DELTA, 1 + IQ1S_DELTA};
         
     | 
| 
       11542 
11854 
     | 
    
         
             
                const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
         
     | 
| 
       11543 
11855 
     | 
    
         | 
| 
       11544 
     | 
    
         
            -
             
     | 
| 
       11545 
     | 
    
         
            -
                float  weight[IQ1S_BLOCK_SIZE];
         
     | 
| 
       11546 
     | 
    
         
            -
                int8_t L[IQ1S_BLOCK_SIZE];
         
     | 
| 
       11547 
     | 
    
         
            -
                float  sumx[IQ1S_BLOCK_SIZE+1];
         
     | 
| 
       11548 
     | 
    
         
            -
                float  sumw[IQ1S_BLOCK_SIZE+1];
         
     | 
| 
       11549 
     | 
    
         
            -
                float  pairs[2*IQ1S_BLOCK_SIZE];
         
     | 
| 
      
 11856 
     | 
    
         
            +
             
     | 
| 
       11550 
11857 
     | 
    
         
             
                int * idx = (int *)(pairs + 1);
         
     | 
| 
       11551 
     | 
    
         
            -
                uint16_t index[IQ1S_BLOCK_SIZE/8];
         
     | 
| 
       11552 
     | 
    
         
            -
                int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
         
     | 
| 
       11553 
11858 
     | 
    
         | 
| 
       11554 
11859 
     | 
    
         
             
                for (int ibl = 0; ibl < nbl; ++ibl) {
         
     | 
| 
       11555 
11860 
     | 
    
         | 
| 
         @@ -11564,15 +11869,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11564 
11869 
     | 
    
         
             
                    for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
         
     | 
| 
       11565 
11870 
     | 
    
         
             
                    float sigma2 = 2*sumx2/QK_K;
         
     | 
| 
       11566 
11871 
     | 
    
         | 
| 
       11567 
     | 
    
         
            -
                    for (int ib = 0; ib < QK_K/ 
     | 
| 
       11568 
     | 
    
         
            -
                        const float * xb = xbl +  
     | 
| 
       11569 
     | 
    
         
            -
                        const float * qw = quant_weights + QK_K*ibl +  
     | 
| 
       11570 
     | 
    
         
            -
                        for (int i = 0; i <  
     | 
| 
      
 11872 
     | 
    
         
            +
                    for (int ib = 0; ib < QK_K/block_size; ++ib) {
         
     | 
| 
      
 11873 
     | 
    
         
            +
                        const float * xb = xbl + block_size*ib;
         
     | 
| 
      
 11874 
     | 
    
         
            +
                        const float * qw = quant_weights + QK_K*ibl + block_size*ib;
         
     | 
| 
      
 11875 
     | 
    
         
            +
                        for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
         
     | 
| 
       11571 
11876 
     | 
    
         
             
                        float max = fabsf(xb[0]);
         
     | 
| 
       11572 
     | 
    
         
            -
                        for (int i = 1; i <  
     | 
| 
      
 11877 
     | 
    
         
            +
                        for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
         
     | 
| 
       11573 
11878 
     | 
    
         
             
                        if (!max) {
         
     | 
| 
       11574 
11879 
     | 
    
         
             
                            scales[ib] = 0;
         
     | 
| 
       11575 
     | 
    
         
            -
                            memset(L, 1,  
     | 
| 
      
 11880 
     | 
    
         
            +
                            memset(L, 1, block_size);
         
     | 
| 
       11576 
11881 
     | 
    
         
             
                            continue;
         
     | 
| 
       11577 
11882 
     | 
    
         
             
                        }
         
     | 
| 
       11578 
11883 
     | 
    
         
             
                        // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
         
     | 
| 
         @@ -11581,14 +11886,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11581 
11886 
     | 
    
         
             
                        // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
         
     | 
| 
       11582 
11887 
     | 
    
         
             
                        // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
         
     | 
| 
       11583 
11888 
     | 
    
         
             
                        // for each possible and score for each split.
         
     | 
| 
       11584 
     | 
    
         
            -
                        for (int j = 0; j <  
     | 
| 
      
 11889 
     | 
    
         
            +
                        for (int j = 0; j < block_size; ++j) {
         
     | 
| 
       11585 
11890 
     | 
    
         
             
                            pairs[2*j] = xb[j];
         
     | 
| 
       11586 
11891 
     | 
    
         
             
                            idx[2*j] = j;
         
     | 
| 
       11587 
11892 
     | 
    
         
             
                        }
         
     | 
| 
       11588 
     | 
    
         
            -
                        qsort(pairs,  
     | 
| 
      
 11893 
     | 
    
         
            +
                        qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
         
     | 
| 
       11589 
11894 
     | 
    
         
             
                        {
         
     | 
| 
       11590 
11895 
     | 
    
         
             
                            sumx[0] = sumw[0] = 0;
         
     | 
| 
       11591 
     | 
    
         
            -
                            for (int j = 0; j <  
     | 
| 
      
 11896 
     | 
    
         
            +
                            for (int j = 0; j < block_size; ++j) {
         
     | 
| 
       11592 
11897 
     | 
    
         
             
                                int i = idx[2*j];
         
     | 
| 
       11593 
11898 
     | 
    
         
             
                                sumx[j+1] = sumx[j] + weight[i]*xb[i];
         
     | 
| 
       11594 
11899 
     | 
    
         
             
                                sumw[j+1] = sumw[j] + weight[i];
         
     | 
| 
         @@ -11596,16 +11901,16 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11596 
11901 
     | 
    
         
             
                        }
         
     | 
| 
       11597 
11902 
     | 
    
         
             
                        float best_score = 0, scale = max;
         
     | 
| 
       11598 
11903 
     | 
    
         
             
                        int besti1 = -1, besti2 = -1, best_shift = 0;
         
     | 
| 
       11599 
     | 
    
         
            -
                        for (int i1 = 0; i1 <=  
     | 
| 
       11600 
     | 
    
         
            -
                            for (int i2 = i1; i2 <=  
     | 
| 
       11601 
     | 
    
         
            -
                                float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[ 
     | 
| 
       11602 
     | 
    
         
            -
                                float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[ 
     | 
| 
      
 11904 
     | 
    
         
            +
                        for (int i1 = 0; i1 <= block_size; ++i1) {
         
     | 
| 
      
 11905 
     | 
    
         
            +
                            for (int i2 = i1; i2 <= block_size; ++i2) {
         
     | 
| 
      
 11906 
     | 
    
         
            +
                                float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
         
     | 
| 
      
 11907 
     | 
    
         
            +
                                float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
         
     | 
| 
       11603 
11908 
     | 
    
         
             
                                if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
         
     | 
| 
       11604 
11909 
     | 
    
         
             
                                    scale = sumqx/sumq2; best_score = scale*sumqx;
         
     | 
| 
       11605 
11910 
     | 
    
         
             
                                    besti1 = i1; besti2 = i2; best_shift = 1;
         
     | 
| 
       11606 
11911 
     | 
    
         
             
                                }
         
     | 
| 
       11607 
     | 
    
         
            -
                                sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[ 
     | 
| 
       11608 
     | 
    
         
            -
                                sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[ 
     | 
| 
      
 11912 
     | 
    
         
            +
                                sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
         
     | 
| 
      
 11913 
     | 
    
         
            +
                                sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
         
     | 
| 
       11609 
11914 
     | 
    
         
             
                                if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
         
     | 
| 
       11610 
11915 
     | 
    
         
             
                                    scale = sumqx/sumq2; best_score = scale*sumqx;
         
     | 
| 
       11611 
11916 
     | 
    
         
             
                                    besti1 = i1; besti2 = i2; best_shift = -1;
         
     | 
| 
         @@ -11615,14 +11920,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11615 
11920 
     | 
    
         
             
                        GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
         
     | 
| 
       11616 
11921 
     | 
    
         
             
                        for (int j =      0; j < besti1; ++j) L[idx[2*j]] = 0;
         
     | 
| 
       11617 
11922 
     | 
    
         
             
                        for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
         
     | 
| 
       11618 
     | 
    
         
            -
                        for (int j = besti2; j <  
     | 
| 
      
 11923 
     | 
    
         
            +
                        for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
         
     | 
| 
       11619 
11924 
     | 
    
         
             
                        if (scale < 0) {
         
     | 
| 
       11620 
     | 
    
         
            -
                            for (int j = 0; j <  
     | 
| 
      
 11925 
     | 
    
         
            +
                            for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
         
     | 
| 
       11621 
11926 
     | 
    
         
             
                            scale = -scale; best_shift = -best_shift;
         
     | 
| 
       11622 
11927 
     | 
    
         
             
                        }
         
     | 
| 
       11623 
11928 
     | 
    
         
             
                        bool all_on_grid = true;
         
     | 
| 
       11624 
11929 
     | 
    
         
             
                        const float * xx = best_shift == 1 ? x_p : x_m;
         
     | 
| 
       11625 
     | 
    
         
            -
                        for (int k = 0; k <  
     | 
| 
      
 11930 
     | 
    
         
            +
                        for (int k = 0; k < block_size/8; ++k) {
         
     | 
| 
       11626 
11931 
     | 
    
         
             
                            uint16_t u = 0;
         
     | 
| 
       11627 
11932 
     | 
    
         
             
                            for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
         
     | 
| 
       11628 
11933 
     | 
    
         
             
                            int grid_index = kmap_q2xs[u];
         
     | 
| 
         @@ -11636,7 +11941,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11636 
11941 
     | 
    
         
             
                        }
         
     | 
| 
       11637 
11942 
     | 
    
         
             
                        if (!all_on_grid) {
         
     | 
| 
       11638 
11943 
     | 
    
         
             
                            float sumqx = 0, sumq2 = 0;
         
     | 
| 
       11639 
     | 
    
         
            -
                            for (int k = 0; k <  
     | 
| 
      
 11944 
     | 
    
         
            +
                            for (int k = 0; k < block_size/8; ++k) {
         
     | 
| 
       11640 
11945 
     | 
    
         
             
                                const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
         
     | 
| 
       11641 
11946 
     | 
    
         
             
                                for (int j = 0; j < 8; ++j) {
         
     | 
| 
       11642 
11947 
     | 
    
         
             
                                    float w = weight[8*k + j];
         
     | 
| 
         @@ -11648,8 +11953,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11648 
11953 
     | 
    
         
             
                            if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
         
     | 
| 
       11649 
11954 
     | 
    
         
             
                        }
         
     | 
| 
       11650 
11955 
     | 
    
         
             
                        uint16_t h = 0;
         
     | 
| 
       11651 
     | 
    
         
            -
                        for (int k = 0; k <  
     | 
| 
       11652 
     | 
    
         
            -
                            y[ibl].qs[( 
     | 
| 
      
 11956 
     | 
    
         
            +
                        for (int k = 0; k < block_size/8; ++k) {
         
     | 
| 
      
 11957 
     | 
    
         
            +
                            y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
         
     | 
| 
       11653 
11958 
     | 
    
         
             
                            h |= (index[k] >> 8) << 3*k;
         
     | 
| 
       11654 
11959 
     | 
    
         
             
                        }
         
     | 
| 
       11655 
11960 
     | 
    
         
             
                        y[ibl].qh[ib] = h;
         
     | 
| 
         @@ -11660,14 +11965,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11660 
11965 
     | 
    
         
             
                    }
         
     | 
| 
       11661 
11966 
     | 
    
         | 
| 
       11662 
11967 
     | 
    
         
             
                    if (!max_scale) {
         
     | 
| 
       11663 
     | 
    
         
            -
                        memset(y[ibl].qs, 0, QK_K/8);
         
     | 
| 
       11664 
11968 
     | 
    
         
             
                        continue;
         
     | 
| 
       11665 
11969 
     | 
    
         
             
                    }
         
     | 
| 
       11666 
11970 
     | 
    
         | 
| 
       11667 
11971 
     | 
    
         
             
                    float d = max_scale/15;
         
     | 
| 
       11668 
     | 
    
         
            -
                    y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1. 
     | 
| 
      
 11972 
     | 
    
         
            +
                    y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
         
     | 
| 
       11669 
11973 
     | 
    
         
             
                    float id = 1/d;
         
     | 
| 
       11670 
     | 
    
         
            -
                    for (int ib = 0; ib < QK_K/ 
     | 
| 
      
 11974 
     | 
    
         
            +
                    for (int ib = 0; ib < QK_K/block_size; ++ib) {
         
     | 
| 
       11671 
11975 
     | 
    
         
             
                        int l = nearest_int(0.5f*(id*scales[ib]-1));
         
     | 
| 
       11672 
11976 
     | 
    
         
             
                        l = MAX(0, MIN(7, l));
         
     | 
| 
       11673 
11977 
     | 
    
         
             
                        if (shifts[ib] == -1) l |= 8;
         
     | 
| 
         @@ -11676,18 +11980,309 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11676 
11980 
     | 
    
         
             
                }
         
     | 
| 
       11677 
11981 
     | 
    
         
             
            }
         
     | 
| 
       11678 
11982 
     | 
    
         | 
| 
       11679 
     | 
    
         
            -
            size_t quantize_iq1_s(const float * restrict src, void * restrict dst,  
     | 
| 
      
 11983 
     | 
    
         
            +
            size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       11680 
11984 
     | 
    
         
             
                GGML_ASSERT(n_per_row%QK_K == 0);
         
     | 
| 
       11681 
     | 
    
         
            -
                 
     | 
| 
      
 11985 
     | 
    
         
            +
                float  scales[QK_K/IQ1S_BLOCK_SIZE];
         
     | 
| 
      
 11986 
     | 
    
         
            +
                float  weight[IQ1S_BLOCK_SIZE];
         
     | 
| 
      
 11987 
     | 
    
         
            +
                int8_t L[IQ1S_BLOCK_SIZE];
         
     | 
| 
      
 11988 
     | 
    
         
            +
                float  sumx[IQ1S_BLOCK_SIZE+1];
         
     | 
| 
      
 11989 
     | 
    
         
            +
                float  sumw[IQ1S_BLOCK_SIZE+1];
         
     | 
| 
      
 11990 
     | 
    
         
            +
                float  pairs[2*IQ1S_BLOCK_SIZE];
         
     | 
| 
      
 11991 
     | 
    
         
            +
                uint16_t index[IQ1S_BLOCK_SIZE/8];
         
     | 
| 
      
 11992 
     | 
    
         
            +
                int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
         
     | 
| 
      
 11993 
     | 
    
         
            +
                int64_t nblock = n_per_row/QK_K;
         
     | 
| 
       11682 
11994 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       11683 
     | 
    
         
            -
                for ( 
     | 
| 
       11684 
     | 
    
         
            -
                    quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
         
     | 
| 
      
 11995 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
      
 11996 
     | 
    
         
            +
                    quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
         
     | 
| 
       11685 
11997 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
       11686 
11998 
     | 
    
         
             
                    qrow += nblock*sizeof(block_iq1_s);
         
     | 
| 
       11687 
11999 
     | 
    
         
             
                }
         
     | 
| 
       11688 
12000 
     | 
    
         
             
                return nrow * nblock * sizeof(block_iq1_s);
         
     | 
| 
       11689 
12001 
     | 
    
         
             
            }
         
     | 
| 
       11690 
12002 
     | 
    
         | 
| 
      
 12003 
     | 
    
         
            +
            static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
         
     | 
| 
      
 12004 
     | 
    
         
            +
                    float    * scales,
         
     | 
| 
      
 12005 
     | 
    
         
            +
                    float    * weight,
         
     | 
| 
      
 12006 
     | 
    
         
            +
                    float    * pairs,
         
     | 
| 
      
 12007 
     | 
    
         
            +
                    int8_t   * L,
         
     | 
| 
      
 12008 
     | 
    
         
            +
                    uint16_t * index,
         
     | 
| 
      
 12009 
     | 
    
         
            +
                    int8_t   * shifts) {
         
     | 
| 
      
 12010 
     | 
    
         
            +
             
     | 
| 
      
 12011 
     | 
    
         
            +
                const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
         
     | 
| 
      
 12012 
     | 
    
         
            +
             
     | 
| 
      
 12013 
     | 
    
         
            +
                const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
         
     | 
| 
      
 12014 
     | 
    
         
            +
                const int      * kmap_q2xs       = iq2_data[gindex].map;
         
     | 
| 
      
 12015 
     | 
    
         
            +
                const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
         
     | 
| 
      
 12016 
     | 
    
         
            +
             
     | 
| 
      
 12017 
     | 
    
         
            +
                //GGML_ASSERT(quant_weights   && "missing quantization weights");
         
     | 
| 
      
 12018 
     | 
    
         
            +
                GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
         
     | 
| 
      
 12019 
     | 
    
         
            +
                GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
         
     | 
| 
      
 12020 
     | 
    
         
            +
                GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
         
     | 
| 
      
 12021 
     | 
    
         
            +
                GGML_ASSERT(n%QK_K == 0);
         
     | 
| 
      
 12022 
     | 
    
         
            +
             
     | 
| 
      
 12023 
     | 
    
         
            +
                block_iq1_m * y = vy;
         
     | 
| 
      
 12024 
     | 
    
         
            +
             
     | 
| 
      
 12025 
     | 
    
         
            +
                const int64_t nbl = n/QK_K;
         
     | 
| 
      
 12026 
     | 
    
         
            +
             
     | 
| 
      
 12027 
     | 
    
         
            +
                const int block_size = IQ1M_BLOCK_SIZE;
         
     | 
| 
      
 12028 
     | 
    
         
            +
             
     | 
| 
      
 12029 
     | 
    
         
            +
                const float x_p[3] = {-1 + IQ1M_DELTA,  IQ1M_DELTA, 1 + IQ1M_DELTA};
         
     | 
| 
      
 12030 
     | 
    
         
            +
                const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
         
     | 
| 
      
 12031 
     | 
    
         
            +
                const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
         
     | 
| 
      
 12032 
     | 
    
         
            +
             
     | 
| 
      
 12033 
     | 
    
         
            +
                int * idx = (int *)(pairs + 1);
         
     | 
| 
      
 12034 
     | 
    
         
            +
             
     | 
| 
      
 12035 
     | 
    
         
            +
                float sumqx[4], sumq2[4];
         
     | 
| 
      
 12036 
     | 
    
         
            +
             
     | 
| 
      
 12037 
     | 
    
         
            +
                iq1m_scale_t s;
         
     | 
| 
      
 12038 
     | 
    
         
            +
                const float * xx;
         
     | 
| 
      
 12039 
     | 
    
         
            +
             
     | 
| 
      
 12040 
     | 
    
         
            +
                for (int ibl = 0; ibl < nbl; ++ibl) {
         
     | 
| 
      
 12041 
     | 
    
         
            +
             
     | 
| 
      
 12042 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 12043 
     | 
    
         
            +
                    y[ibl].d = GGML_FP32_TO_FP16(0.f);
         
     | 
| 
      
 12044 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 12045 
     | 
    
         
            +
                    memset(y[ibl].qs, 0, QK_K/8);
         
     | 
| 
      
 12046 
     | 
    
         
            +
                    memset(y[ibl].qh, 0, QK_K/16);
         
     | 
| 
      
 12047 
     | 
    
         
            +
                    memset(y[ibl].scales, 0, QK_K/32);
         
     | 
| 
      
 12048 
     | 
    
         
            +
             
     | 
| 
      
 12049 
     | 
    
         
            +
                    float max_scale = 0;
         
     | 
| 
      
 12050 
     | 
    
         
            +
             
     | 
| 
      
 12051 
     | 
    
         
            +
                    const float * xbl = x + QK_K*ibl;
         
     | 
| 
      
 12052 
     | 
    
         
            +
                    float sumx2 = 0;
         
     | 
| 
      
 12053 
     | 
    
         
            +
                    for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
         
     | 
| 
      
 12054 
     | 
    
         
            +
                    float sigma2 = 2*sumx2/QK_K;
         
     | 
| 
      
 12055 
     | 
    
         
            +
             
     | 
| 
      
 12056 
     | 
    
         
            +
                    for (int ib = 0; ib < QK_K/block_size; ++ib) {
         
     | 
| 
      
 12057 
     | 
    
         
            +
                        const float * xb = xbl + block_size*ib;
         
     | 
| 
      
 12058 
     | 
    
         
            +
                        if (quant_weights) {
         
     | 
| 
      
 12059 
     | 
    
         
            +
                            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
         
     | 
| 
      
 12060 
     | 
    
         
            +
                            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
         
     | 
| 
      
 12061 
     | 
    
         
            +
                        } else {
         
     | 
| 
      
 12062 
     | 
    
         
            +
                            for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
         
     | 
| 
      
 12063 
     | 
    
         
            +
                        }
         
     | 
| 
      
 12064 
     | 
    
         
            +
                        float max = fabsf(xb[0]);
         
     | 
| 
      
 12065 
     | 
    
         
            +
                        for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
         
     | 
| 
      
 12066 
     | 
    
         
            +
                        if (!max) {
         
     | 
| 
      
 12067 
     | 
    
         
            +
                            scales[ib] = 0;
         
     | 
| 
      
 12068 
     | 
    
         
            +
                            memset(L, 1, block_size);
         
     | 
| 
      
 12069 
     | 
    
         
            +
                            continue;
         
     | 
| 
      
 12070 
     | 
    
         
            +
                        }
         
     | 
| 
      
 12071 
     | 
    
         
            +
                        // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
         
     | 
| 
      
 12072 
     | 
    
         
            +
                        // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
         
     | 
| 
      
 12073 
     | 
    
         
            +
                        // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
         
     | 
| 
      
 12074 
     | 
    
         
            +
                        // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
         
     | 
| 
      
 12075 
     | 
    
         
            +
                        // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
         
     | 
| 
      
 12076 
     | 
    
         
            +
                        // for each possible and score for each split.
         
     | 
| 
      
 12077 
     | 
    
         
            +
                        for (int j = 0; j < block_size; ++j) {
         
     | 
| 
      
 12078 
     | 
    
         
            +
                            pairs[2*j] = xb[j];
         
     | 
| 
      
 12079 
     | 
    
         
            +
                            idx[2*j] = j;
         
     | 
| 
      
 12080 
     | 
    
         
            +
                        }
         
     | 
| 
      
 12081 
     | 
    
         
            +
                        qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
         
     | 
| 
      
 12082 
     | 
    
         
            +
                        float best_score = 0, scale = max;
         
     | 
| 
      
 12083 
     | 
    
         
            +
                        int besti1 = -1, besti2 = -1, best_k = -1;
         
     | 
| 
      
 12084 
     | 
    
         
            +
                        // 0: +, +
         
     | 
| 
      
 12085 
     | 
    
         
            +
                        // 1: +, -
         
     | 
| 
      
 12086 
     | 
    
         
            +
                        // 2: -, +
         
     | 
| 
      
 12087 
     | 
    
         
            +
                        // 3: -, -
         
     | 
| 
      
 12088 
     | 
    
         
            +
                        for (int i1 = 0; i1 <= block_size; ++i1) {
         
     | 
| 
      
 12089 
     | 
    
         
            +
                            for (int i2 = i1; i2 <= block_size; ++i2) {
         
     | 
| 
      
 12090 
     | 
    
         
            +
                                memset(sumqx, 0, 4*sizeof(float));
         
     | 
| 
      
 12091 
     | 
    
         
            +
                                memset(sumq2, 0, 4*sizeof(float));
         
     | 
| 
      
 12092 
     | 
    
         
            +
                                for (int j = 0; j < i1; ++j) {
         
     | 
| 
      
 12093 
     | 
    
         
            +
                                    int i = idx[2*j];
         
     | 
| 
      
 12094 
     | 
    
         
            +
                                    if (i < block_size/2) {
         
     | 
| 
      
 12095 
     | 
    
         
            +
                                        sumqx[0] += weight[i]*x_p[0]*xb[i];
         
     | 
| 
      
 12096 
     | 
    
         
            +
                                        sumqx[1] += weight[i]*x_p[0]*xb[i];
         
     | 
| 
      
 12097 
     | 
    
         
            +
                                        sumqx[2] += weight[i]*x_m[0]*xb[i];
         
     | 
| 
      
 12098 
     | 
    
         
            +
                                        sumqx[3] += weight[i]*x_m[0]*xb[i];
         
     | 
| 
      
 12099 
     | 
    
         
            +
                                        sumq2[0] += weight[i]*x_p[0]*x_p[0];
         
     | 
| 
      
 12100 
     | 
    
         
            +
                                        sumq2[1] += weight[i]*x_p[0]*x_p[0];
         
     | 
| 
      
 12101 
     | 
    
         
            +
                                        sumq2[2] += weight[i]*x_m[0]*x_m[0];
         
     | 
| 
      
 12102 
     | 
    
         
            +
                                        sumq2[3] += weight[i]*x_m[0]*x_m[0];
         
     | 
| 
      
 12103 
     | 
    
         
            +
                                    } else {
         
     | 
| 
      
 12104 
     | 
    
         
            +
                                        sumqx[0] += weight[i]*x_p[0]*xb[i];
         
     | 
| 
      
 12105 
     | 
    
         
            +
                                        sumqx[2] += weight[i]*x_p[0]*xb[i];
         
     | 
| 
      
 12106 
     | 
    
         
            +
                                        sumqx[1] += weight[i]*x_m[0]*xb[i];
         
     | 
| 
      
 12107 
     | 
    
         
            +
                                        sumqx[3] += weight[i]*x_m[0]*xb[i];
         
     | 
| 
      
 12108 
     | 
    
         
            +
                                        sumq2[0] += weight[i]*x_p[0]*x_p[0];
         
     | 
| 
      
 12109 
     | 
    
         
            +
                                        sumq2[2] += weight[i]*x_p[0]*x_p[0];
         
     | 
| 
      
 12110 
     | 
    
         
            +
                                        sumq2[1] += weight[i]*x_m[0]*x_m[0];
         
     | 
| 
      
 12111 
     | 
    
         
            +
                                        sumq2[3] += weight[i]*x_m[0]*x_m[0];
         
     | 
| 
      
 12112 
     | 
    
         
            +
                                    }
         
     | 
| 
      
 12113 
     | 
    
         
            +
                                }
         
     | 
| 
      
 12114 
     | 
    
         
            +
                                for (int j = i1; j < i2; ++j) {
         
     | 
| 
      
 12115 
     | 
    
         
            +
                                    int i = idx[2*j];
         
     | 
| 
      
 12116 
     | 
    
         
            +
                                    if (i < block_size/2) {
         
     | 
| 
      
 12117 
     | 
    
         
            +
                                        sumqx[0] += weight[i]*x_p[1]*xb[i];
         
     | 
| 
      
 12118 
     | 
    
         
            +
                                        sumqx[1] += weight[i]*x_p[1]*xb[i];
         
     | 
| 
      
 12119 
     | 
    
         
            +
                                        sumqx[2] += weight[i]*x_m[1]*xb[i];
         
     | 
| 
      
 12120 
     | 
    
         
            +
                                        sumqx[3] += weight[i]*x_m[1]*xb[i];
         
     | 
| 
      
 12121 
     | 
    
         
            +
                                        sumq2[0] += weight[i]*x_p[1]*x_p[1];
         
     | 
| 
      
 12122 
     | 
    
         
            +
                                        sumq2[1] += weight[i]*x_p[1]*x_p[1];
         
     | 
| 
      
 12123 
     | 
    
         
            +
                                        sumq2[2] += weight[i]*x_m[1]*x_m[1];
         
     | 
| 
      
 12124 
     | 
    
         
            +
                                        sumq2[3] += weight[i]*x_m[1]*x_m[1];
         
     | 
| 
      
 12125 
     | 
    
         
            +
                                    } else {
         
     | 
| 
      
 12126 
     | 
    
         
            +
                                        sumqx[0] += weight[i]*x_p[1]*xb[i];
         
     | 
| 
      
 12127 
     | 
    
         
            +
                                        sumqx[2] += weight[i]*x_p[1]*xb[i];
         
     | 
| 
      
 12128 
     | 
    
         
            +
                                        sumqx[1] += weight[i]*x_m[1]*xb[i];
         
     | 
| 
      
 12129 
     | 
    
         
            +
                                        sumqx[3] += weight[i]*x_m[1]*xb[i];
         
     | 
| 
      
 12130 
     | 
    
         
            +
                                        sumq2[0] += weight[i]*x_p[1]*x_p[1];
         
     | 
| 
      
 12131 
     | 
    
         
            +
                                        sumq2[2] += weight[i]*x_p[1]*x_p[1];
         
     | 
| 
      
 12132 
     | 
    
         
            +
                                        sumq2[1] += weight[i]*x_m[1]*x_m[1];
         
     | 
| 
      
 12133 
     | 
    
         
            +
                                        sumq2[3] += weight[i]*x_m[1]*x_m[1];
         
     | 
| 
      
 12134 
     | 
    
         
            +
                                    }
         
     | 
| 
      
 12135 
     | 
    
         
            +
                                }
         
     | 
| 
      
 12136 
     | 
    
         
            +
                                for (int j = i2; j < block_size; ++j) {
         
     | 
| 
      
 12137 
     | 
    
         
            +
                                    int i = idx[2*j];
         
     | 
| 
      
 12138 
     | 
    
         
            +
                                    if (i < block_size/2) {
         
     | 
| 
      
 12139 
     | 
    
         
            +
                                        sumqx[0] += weight[i]*x_p[2]*xb[i];
         
     | 
| 
      
 12140 
     | 
    
         
            +
                                        sumqx[1] += weight[i]*x_p[2]*xb[i];
         
     | 
| 
      
 12141 
     | 
    
         
            +
                                        sumqx[2] += weight[i]*x_m[2]*xb[i];
         
     | 
| 
      
 12142 
     | 
    
         
            +
                                        sumqx[3] += weight[i]*x_m[2]*xb[i];
         
     | 
| 
      
 12143 
     | 
    
         
            +
                                        sumq2[0] += weight[i]*x_p[2]*x_p[2];
         
     | 
| 
      
 12144 
     | 
    
         
            +
                                        sumq2[1] += weight[i]*x_p[2]*x_p[2];
         
     | 
| 
      
 12145 
     | 
    
         
            +
                                        sumq2[2] += weight[i]*x_m[2]*x_m[2];
         
     | 
| 
      
 12146 
     | 
    
         
            +
                                        sumq2[3] += weight[i]*x_m[2]*x_m[2];
         
     | 
| 
      
 12147 
     | 
    
         
            +
                                    } else {
         
     | 
| 
      
 12148 
     | 
    
         
            +
                                        sumqx[0] += weight[i]*x_p[2]*xb[i];
         
     | 
| 
      
 12149 
     | 
    
         
            +
                                        sumqx[2] += weight[i]*x_p[2]*xb[i];
         
     | 
| 
      
 12150 
     | 
    
         
            +
                                        sumqx[1] += weight[i]*x_m[2]*xb[i];
         
     | 
| 
      
 12151 
     | 
    
         
            +
                                        sumqx[3] += weight[i]*x_m[2]*xb[i];
         
     | 
| 
      
 12152 
     | 
    
         
            +
                                        sumq2[0] += weight[i]*x_p[2]*x_p[2];
         
     | 
| 
      
 12153 
     | 
    
         
            +
                                        sumq2[2] += weight[i]*x_p[2]*x_p[2];
         
     | 
| 
      
 12154 
     | 
    
         
            +
                                        sumq2[1] += weight[i]*x_m[2]*x_m[2];
         
     | 
| 
      
 12155 
     | 
    
         
            +
                                        sumq2[3] += weight[i]*x_m[2]*x_m[2];
         
     | 
| 
      
 12156 
     | 
    
         
            +
                                    }
         
     | 
| 
      
 12157 
     | 
    
         
            +
                                }
         
     | 
| 
      
 12158 
     | 
    
         
            +
                                for (int k = 0; k < 4; ++k) {
         
     | 
| 
      
 12159 
     | 
    
         
            +
                                    if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
         
     | 
| 
      
 12160 
     | 
    
         
            +
                                        scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
         
     | 
| 
      
 12161 
     | 
    
         
            +
                                        besti1 = i1; besti2 = i2; best_k = k;
         
     | 
| 
      
 12162 
     | 
    
         
            +
                                    }
         
     | 
| 
      
 12163 
     | 
    
         
            +
                                }
         
     | 
| 
      
 12164 
     | 
    
         
            +
                            }
         
     | 
| 
      
 12165 
     | 
    
         
            +
                        }
         
     | 
| 
      
 12166 
     | 
    
         
            +
                        GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
         
     | 
| 
      
 12167 
     | 
    
         
            +
                        for (int j =      0; j < besti1; ++j) L[idx[2*j]] = 0;
         
     | 
| 
      
 12168 
     | 
    
         
            +
                        for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
         
     | 
| 
      
 12169 
     | 
    
         
            +
                        for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
         
     | 
| 
      
 12170 
     | 
    
         
            +
                        if (scale < 0) {
         
     | 
| 
      
 12171 
     | 
    
         
            +
                            for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
         
     | 
| 
      
 12172 
     | 
    
         
            +
                            scale = -scale;
         
     | 
| 
      
 12173 
     | 
    
         
            +
                            best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
         
     | 
| 
      
 12174 
     | 
    
         
            +
                        }
         
     | 
| 
      
 12175 
     | 
    
         
            +
                        bool all_on_grid = true;
         
     | 
| 
      
 12176 
     | 
    
         
            +
                        for (int k = 0; k < block_size/8; ++k) {
         
     | 
| 
      
 12177 
     | 
    
         
            +
                            if (k == 0) xx = best_k < 2 ? x_p : x_m;
         
     | 
| 
      
 12178 
     | 
    
         
            +
                            else xx = best_k%2 == 0 ? x_p : x_m;
         
     | 
| 
      
 12179 
     | 
    
         
            +
                            uint16_t u = 0;
         
     | 
| 
      
 12180 
     | 
    
         
            +
                            for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
         
     | 
| 
      
 12181 
     | 
    
         
            +
                            int grid_index = kmap_q2xs[u];
         
     | 
| 
      
 12182 
     | 
    
         
            +
                            if (grid_index < 0) {
         
     | 
| 
      
 12183 
     | 
    
         
            +
                                all_on_grid = false;
         
     | 
| 
      
 12184 
     | 
    
         
            +
                                const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
         
     | 
| 
      
 12185 
     | 
    
         
            +
                                grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
         
     | 
| 
      
 12186 
     | 
    
         
            +
                                GGML_ASSERT(grid_index >= 0);
         
     | 
| 
      
 12187 
     | 
    
         
            +
                            }
         
     | 
| 
      
 12188 
     | 
    
         
            +
                            index[k] = grid_index;
         
     | 
| 
      
 12189 
     | 
    
         
            +
                        }
         
     | 
| 
      
 12190 
     | 
    
         
            +
                        if (!all_on_grid) {
         
     | 
| 
      
 12191 
     | 
    
         
            +
                            float sumqx_f = 0, sumq2_f = 0;
         
     | 
| 
      
 12192 
     | 
    
         
            +
                            for (int k = 0; k < block_size/8; ++k) {
         
     | 
| 
      
 12193 
     | 
    
         
            +
                                if (k == 0) xx = best_k < 2 ? x_p : x_m;
         
     | 
| 
      
 12194 
     | 
    
         
            +
                                else xx = best_k%2 == 0 ? x_p : x_m;
         
     | 
| 
      
 12195 
     | 
    
         
            +
                                const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
         
     | 
| 
      
 12196 
     | 
    
         
            +
                                for (int j = 0; j < 8; ++j) {
         
     | 
| 
      
 12197 
     | 
    
         
            +
                                    float w = weight[8*k + j];
         
     | 
| 
      
 12198 
     | 
    
         
            +
                                    float q = xx[(pg[j] - 1)/2];
         
     | 
| 
      
 12199 
     | 
    
         
            +
                                    sumqx_f += w*q*xb[8*k+j];
         
     | 
| 
      
 12200 
     | 
    
         
            +
                                    sumq2_f += w*q*q;
         
     | 
| 
      
 12201 
     | 
    
         
            +
                                }
         
     | 
| 
      
 12202 
     | 
    
         
            +
                            }
         
     | 
| 
      
 12203 
     | 
    
         
            +
                            if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
         
     | 
| 
      
 12204 
     | 
    
         
            +
                        }
         
     | 
| 
      
 12205 
     | 
    
         
            +
                        y[ibl].qs[2*ib + 0] = index[0] & 255;
         
     | 
| 
      
 12206 
     | 
    
         
            +
                        y[ibl].qs[2*ib + 1] = index[1] & 255;
         
     | 
| 
      
 12207 
     | 
    
         
            +
                        y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
         
     | 
| 
      
 12208 
     | 
    
         
            +
                        GGML_ASSERT(scale >= 0);
         
     | 
| 
      
 12209 
     | 
    
         
            +
                        scales[ib] = scale;
         
     | 
| 
      
 12210 
     | 
    
         
            +
                        shifts[ib] = best_k;
         
     | 
| 
      
 12211 
     | 
    
         
            +
                        max_scale = MAX(max_scale, scale);
         
     | 
| 
      
 12212 
     | 
    
         
            +
                    }
         
     | 
| 
      
 12213 
     | 
    
         
            +
             
     | 
| 
      
 12214 
     | 
    
         
            +
                    if (!max_scale) {
         
     | 
| 
      
 12215 
     | 
    
         
            +
                        continue;
         
     | 
| 
      
 12216 
     | 
    
         
            +
                    }
         
     | 
| 
      
 12217 
     | 
    
         
            +
             
     | 
| 
      
 12218 
     | 
    
         
            +
                    uint16_t * sc = (uint16_t *)y[ibl].scales;
         
     | 
| 
      
 12219 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 12220 
     | 
    
         
            +
                    float d = max_scale/31;
         
     | 
| 
      
 12221 
     | 
    
         
            +
            #else
         
     | 
| 
      
 12222 
     | 
    
         
            +
                    float d = max_scale/15;
         
     | 
| 
      
 12223 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 12224 
     | 
    
         
            +
                    float id = 1/d;
         
     | 
| 
      
 12225 
     | 
    
         
            +
                    float sumqx_f = 0, sumq2_f = 0;
         
     | 
| 
      
 12226 
     | 
    
         
            +
                    for (int ib = 0; ib < QK_K/block_size; ++ib) {
         
     | 
| 
      
 12227 
     | 
    
         
            +
                        int l = nearest_int(0.5f*(id*scales[ib+0]-1));
         
     | 
| 
      
 12228 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 12229 
     | 
    
         
            +
                        l = MAX(0, MIN(15, l));
         
     | 
| 
      
 12230 
     | 
    
         
            +
                        sc[ib/4] |= (l << 4*(ib%4));
         
     | 
| 
      
 12231 
     | 
    
         
            +
            #else
         
     | 
| 
      
 12232 
     | 
    
         
            +
                        l = MAX(0, MIN(7, l));
         
     | 
| 
      
 12233 
     | 
    
         
            +
                        sc[ib/4] |= (l << 3*(ib%4));
         
     | 
| 
      
 12234 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 12235 
     | 
    
         
            +
                        y[ibl].qh[ib] |= masks[shifts[ib]];
         
     | 
| 
      
 12236 
     | 
    
         
            +
                        const float * xb = xbl + block_size*ib;
         
     | 
| 
      
 12237 
     | 
    
         
            +
                        if (quant_weights) {
         
     | 
| 
      
 12238 
     | 
    
         
            +
                            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
         
     | 
| 
      
 12239 
     | 
    
         
            +
                            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
         
     | 
| 
      
 12240 
     | 
    
         
            +
                        } else {
         
     | 
| 
      
 12241 
     | 
    
         
            +
                            for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
         
     | 
| 
      
 12242 
     | 
    
         
            +
                        }
         
     | 
| 
      
 12243 
     | 
    
         
            +
                        for (int k = 0; k < block_size/8; ++k) {
         
     | 
| 
      
 12244 
     | 
    
         
            +
                            if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
         
     | 
| 
      
 12245 
     | 
    
         
            +
                            else xx = shifts[ib]%2 == 0 ? x_p : x_m;
         
     | 
| 
      
 12246 
     | 
    
         
            +
                            const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
         
     | 
| 
      
 12247 
     | 
    
         
            +
                            for (int j = 0; j < 8; ++j) {
         
     | 
| 
      
 12248 
     | 
    
         
            +
                                float w = weight[8*k + j];
         
     | 
| 
      
 12249 
     | 
    
         
            +
                                float q = xx[(pg[j] - 1)/2]*(2*l+1);
         
     | 
| 
      
 12250 
     | 
    
         
            +
                                sumqx_f += w*q*xb[8*k+j];
         
     | 
| 
      
 12251 
     | 
    
         
            +
                                sumq2_f += w*q*q;
         
     | 
| 
      
 12252 
     | 
    
         
            +
                            }
         
     | 
| 
      
 12253 
     | 
    
         
            +
                        }
         
     | 
| 
      
 12254 
     | 
    
         
            +
                    }
         
     | 
| 
      
 12255 
     | 
    
         
            +
                    if (sumq2_f > 0) d = sumqx_f/sumq2_f;
         
     | 
| 
      
 12256 
     | 
    
         
            +
                    s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
         
     | 
| 
      
 12257 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 12258 
     | 
    
         
            +
                    y[ibl].d = s.f16;
         
     | 
| 
      
 12259 
     | 
    
         
            +
            #else
         
     | 
| 
      
 12260 
     | 
    
         
            +
                    sc[0] |= ((s.u16 & 0x000f) << 12);
         
     | 
| 
      
 12261 
     | 
    
         
            +
                    sc[1] |= ((s.u16 & 0x00f0) <<  8);
         
     | 
| 
      
 12262 
     | 
    
         
            +
                    sc[2] |= ((s.u16 & 0x0f00) <<  4);
         
     | 
| 
      
 12263 
     | 
    
         
            +
                    sc[3] |= ((s.u16 & 0xf000) <<  0);
         
     | 
| 
      
 12264 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 12265 
     | 
    
         
            +
                }
         
     | 
| 
      
 12266 
     | 
    
         
            +
            }
         
     | 
| 
      
 12267 
     | 
    
         
            +
             
     | 
| 
      
 12268 
     | 
    
         
            +
            size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
      
 12269 
     | 
    
         
            +
                GGML_ASSERT(n_per_row%QK_K == 0);
         
     | 
| 
      
 12270 
     | 
    
         
            +
                float  scales[QK_K/IQ1M_BLOCK_SIZE];
         
     | 
| 
      
 12271 
     | 
    
         
            +
                float  weight[IQ1M_BLOCK_SIZE];
         
     | 
| 
      
 12272 
     | 
    
         
            +
                int8_t L[IQ1M_BLOCK_SIZE];
         
     | 
| 
      
 12273 
     | 
    
         
            +
                float  pairs[2*IQ1M_BLOCK_SIZE];
         
     | 
| 
      
 12274 
     | 
    
         
            +
                uint16_t index[IQ1M_BLOCK_SIZE/8];
         
     | 
| 
      
 12275 
     | 
    
         
            +
                int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
         
     | 
| 
      
 12276 
     | 
    
         
            +
                int64_t nblock = n_per_row/QK_K;
         
     | 
| 
      
 12277 
     | 
    
         
            +
                char * qrow = (char *)dst;
         
     | 
| 
      
 12278 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
      
 12279 
     | 
    
         
            +
                    quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
         
     | 
| 
      
 12280 
     | 
    
         
            +
                    src += n_per_row;
         
     | 
| 
      
 12281 
     | 
    
         
            +
                    qrow += nblock*sizeof(block_iq1_m);
         
     | 
| 
      
 12282 
     | 
    
         
            +
                }
         
     | 
| 
      
 12283 
     | 
    
         
            +
                return nrow * nblock * sizeof(block_iq1_m);
         
     | 
| 
      
 12284 
     | 
    
         
            +
            }
         
     | 
| 
      
 12285 
     | 
    
         
            +
             
     | 
| 
       11691 
12286 
     | 
    
         
             
            // ============================ 4-bit non-linear quants
         
     | 
| 
       11692 
12287 
     | 
    
         | 
| 
       11693 
12288 
     | 
    
         
             
            static inline int best_index_int8(int n, const int8_t * val, float x) {
         
     | 
| 
         @@ -11812,16 +12407,16 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block 
     | 
|
| 
       11812 
12407 
     | 
    
         
             
                }
         
     | 
| 
       11813 
12408 
     | 
    
         
             
            }
         
     | 
| 
       11814 
12409 
     | 
    
         | 
| 
       11815 
     | 
    
         
            -
            size_t quantize_iq4_nl(const float * restrict src, void * restrict dst,  
     | 
| 
      
 12410 
     | 
    
         
            +
            size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       11816 
12411 
     | 
    
         
             
                GGML_ASSERT(n_per_row%QK4_NL == 0);
         
     | 
| 
       11817 
     | 
    
         
            -
                 
     | 
| 
      
 12412 
     | 
    
         
            +
                int64_t nblock = n_per_row/QK4_NL;
         
     | 
| 
       11818 
12413 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       11819 
12414 
     | 
    
         
             
                uint8_t L[QK4_NL];
         
     | 
| 
       11820 
12415 
     | 
    
         
             
                float weight[QK4_NL];
         
     | 
| 
       11821 
12416 
     | 
    
         
             
                uint16_t unused_h;
         
     | 
| 
       11822 
12417 
     | 
    
         
             
                uint8_t * unused_l = NULL;
         
     | 
| 
       11823 
12418 
     | 
    
         
             
                float scale;
         
     | 
| 
       11824 
     | 
    
         
            -
                for ( 
     | 
| 
      
 12419 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       11825 
12420 
     | 
    
         
             
                    block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
         
     | 
| 
       11826 
12421 
     | 
    
         
             
                    for (int ibl = 0; ibl < nblock; ++ibl) {
         
     | 
| 
       11827 
12422 
     | 
    
         
             
                        const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
         
     | 
| 
         @@ -11834,9 +12429,9 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow 
     | 
|
| 
       11834 
12429 
     | 
    
         
             
                return nrow * nblock * sizeof(block_iq4_nl);
         
     | 
| 
       11835 
12430 
     | 
    
         
             
            }
         
     | 
| 
       11836 
12431 
     | 
    
         | 
| 
       11837 
     | 
    
         
            -
            void quantize_row_iq4_nl(const float * restrict x, void * restrict vy,  
     | 
| 
      
 12432 
     | 
    
         
            +
            void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       11838 
12433 
     | 
    
         
             
                GGML_ASSERT(k%QK4_NL == 0);
         
     | 
| 
       11839 
     | 
    
         
            -
                 
     | 
| 
      
 12434 
     | 
    
         
            +
                int64_t nblock = k/QK4_NL;
         
     | 
| 
       11840 
12435 
     | 
    
         
             
                uint8_t L[QK4_NL];
         
     | 
| 
       11841 
12436 
     | 
    
         
             
                float weight[QK4_NL];
         
     | 
| 
       11842 
12437 
     | 
    
         
             
                uint16_t unused_h;
         
     | 
| 
         @@ -11849,22 +12444,22 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { 
     | 
|
| 
       11849 
12444 
     | 
    
         
             
                }
         
     | 
| 
       11850 
12445 
     | 
    
         
             
            }
         
     | 
| 
       11851 
12446 
     | 
    
         | 
| 
       11852 
     | 
    
         
            -
            void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y,  
     | 
| 
      
 12447 
     | 
    
         
            +
            /*
             * Reference entry point for IQ4_NL row quantization.
             * Checks (debug builds only, via assert) that `k` is a multiple of QK4_NL
             * and then delegates to quantize_row_iq4_nl with the same arguments.
             */
            void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
                assert((k % QK4_NL) == 0);
                quantize_row_iq4_nl(x, (void *)y, k);
            }
         
     | 
| 
       11856 
12451 
     | 
    
         | 
| 
       11857 
     | 
    
         
            -
            size_t quantize_iq4_xs(const float * restrict src, void * restrict dst,  
     | 
| 
      
 12452 
     | 
    
         
            +
            size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       11858 
12453 
     | 
    
         
             
            #if QK_K == 64
         
     | 
| 
       11859 
12454 
     | 
    
         
             
                return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
         
     | 
| 
       11860 
12455 
     | 
    
         
             
            #else
         
     | 
| 
       11861 
12456 
     | 
    
         
             
                GGML_ASSERT(n_per_row%QK_K == 0);
         
     | 
| 
       11862 
     | 
    
         
            -
                 
     | 
| 
      
 12457 
     | 
    
         
            +
                int64_t nblock = n_per_row/QK_K;
         
     | 
| 
       11863 
12458 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       11864 
12459 
     | 
    
         
             
                uint8_t L[QK_K];
         
     | 
| 
       11865 
12460 
     | 
    
         
             
                float weight[32];
         
     | 
| 
       11866 
12461 
     | 
    
         
             
                float scales[QK_K/32];
         
     | 
| 
       11867 
     | 
    
         
            -
                for ( 
     | 
| 
      
 12462 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       11868 
12463 
     | 
    
         
             
                    block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
         
     | 
| 
       11869 
12464 
     | 
    
         
             
                    for (int ibl = 0; ibl < nblock; ++ibl) {
         
     | 
| 
       11870 
12465 
     | 
    
         
             
                        const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
         
     | 
| 
         @@ -11878,20 +12473,20 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow 
     | 
|
| 
       11878 
12473 
     | 
    
         
             
            #endif
         
     | 
| 
       11879 
12474 
     | 
    
         
             
            }
         
     | 
| 
       11880 
12475 
     | 
    
         | 
| 
       11881 
     | 
    
         
            -
            void quantize_row_iq4_xs(const float * restrict x, void * restrict vy,  
     | 
| 
      
 12476 
     | 
    
         
            +
            void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       11882 
12477 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       11883 
12478 
     | 
    
         
             
                block_iq4_xs * restrict y = vy;
         
     | 
| 
       11884 
12479 
     | 
    
         
             
                quantize_row_iq4_xs_reference(x, y, k);
         
     | 
| 
       11885 
12480 
     | 
    
         
             
            }
         
     | 
| 
       11886 
12481 
     | 
    
         | 
| 
       11887 
     | 
    
         
            -
            void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y,  
     | 
| 
      
 12482 
     | 
    
         
            +
            /*
             * Reference IQ4_XS quantization of a single row of `k` floats.
             * Checks (via assert) that `k` is a multiple of QK_K, then calls
             * quantize_iq4_xs for one row with no quantization weights (NULL).
             * The byte count returned by quantize_iq4_xs is discarded.
             */
            void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
                assert((k % QK_K) == 0);
                (void) quantize_iq4_xs(x, y, /*nrow=*/1, /*n_per_row=*/k, /*quant_weights=*/NULL);
            }
         
     | 
| 
       11891 
12486 
     | 
    
         | 
| 
       11892 
12487 
     | 
    
         
             
            // =============================== 2.5625 bpw
         
     | 
| 
       11893 
12488 
     | 
    
         | 
| 
       11894 
     | 
    
         
            -
            static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy,  
     | 
| 
      
 12489 
     | 
    
         
            +
            static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
         
     | 
| 
       11895 
12490 
     | 
    
         | 
| 
       11896 
12491 
     | 
    
         
             
                const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
         
     | 
| 
       11897 
12492 
     | 
    
         | 
| 
         @@ -11906,7 +12501,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       11906 
12501 
     | 
    
         | 
| 
       11907 
12502 
     | 
    
         
             
                const int kMaxQ = 3;
         
     | 
| 
       11908 
12503 
     | 
    
         | 
| 
       11909 
     | 
    
         
            -
                const  
     | 
| 
      
 12504 
     | 
    
         
            +
                const int64_t nbl = n/QK_K;
         
     | 
| 
       11910 
12505 
     | 
    
         | 
| 
       11911 
12506 
     | 
    
         
             
                block_iq2_s * y = vy;
         
     | 
| 
       11912 
12507 
     | 
    
         | 
| 
         @@ -12059,11 +12654,11 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy 
     | 
|
| 
       12059 
12654 
     | 
    
         
             
                }
         
     | 
| 
       12060 
12655 
     | 
    
         
             
            }
         
     | 
| 
       12061 
12656 
     | 
    
         | 
| 
       12062 
     | 
    
         
            -
            size_t quantize_iq2_s(const float * restrict src, void * restrict dst,  
     | 
| 
      
 12657 
     | 
    
         
            +
            size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
         
     | 
| 
       12063 
12658 
     | 
    
         
             
                GGML_ASSERT(n_per_row%QK_K == 0);
         
     | 
| 
       12064 
     | 
    
         
            -
                 
     | 
| 
      
 12659 
     | 
    
         
            +
                int64_t nblock = n_per_row/QK_K;
         
     | 
| 
       12065 
12660 
     | 
    
         
             
                char * qrow = (char *)dst;
         
     | 
| 
       12066 
     | 
    
         
            -
                for ( 
     | 
| 
      
 12661 
     | 
    
         
            +
                for (int64_t row = 0; row < nrow; ++row) {
         
     | 
| 
       12067 
12662 
     | 
    
         
             
                    quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
         
     | 
| 
       12068 
12663 
     | 
    
         
             
                    src += n_per_row;
         
     | 
| 
       12069 
12664 
     | 
    
         
             
                    qrow += nblock*sizeof(block_iq2_s);
         
     | 
| 
         @@ -12071,12 +12666,12 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, 
     | 
|
| 
       12071 
12666 
     | 
    
         
             
                return nrow * nblock * sizeof(block_iq2_s);
         
     | 
| 
       12072 
12667 
     | 
    
         
             
            }
         
     | 
| 
       12073 
12668 
     | 
    
         | 
| 
       12074 
     | 
    
         
            -
            void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y,  
     | 
| 
      
 12669 
     | 
    
         
            +
            /*
             * Reference IQ2_S quantization of a single row of `k` floats.
             * Checks (via assert) that `k` is a multiple of QK_K, then calls
             * quantize_iq2_s for one row with no quantization weights (NULL).
             * The byte count returned by quantize_iq2_s is discarded.
             */
            void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
                assert((k % QK_K) == 0);
                (void) quantize_iq2_s(x, y, /*nrow=*/1, /*n_per_row=*/k, /*quant_weights=*/NULL);
            }
         
     | 
| 
       12078 
12673 
     | 
    
         | 
| 
       12079 
     | 
    
         
            -
            void quantize_row_iq2_s(const float * restrict x, void * restrict vy,  
     | 
| 
      
 12674 
     | 
    
         
            +
            void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
         
     | 
| 
       12080 
12675 
     | 
    
         
             
                assert(k % QK_K == 0);
         
     | 
| 
       12081 
12676 
     | 
    
         
             
                block_iq2_s * restrict y = vy;
         
     | 
| 
       12082 
12677 
     | 
    
         
             
                quantize_row_iq2_s_reference(x, y, k);
         
     |