llama_cpp 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +17 -0
 - data/README.md +1 -1
 - data/examples/chat.rb +2 -4
 - data/ext/llama_cpp/extconf.rb +3 -3
 - data/ext/llama_cpp/llama_cpp.cpp +118 -117
 - data/ext/llama_cpp/src/ggml-alloc.c +97 -53
 - data/ext/llama_cpp/src/ggml-alloc.h +4 -0
 - data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
 - data/ext/llama_cpp/src/ggml-cuda.h +32 -23
 - data/ext/llama_cpp/src/ggml-metal.h +9 -3
 - data/ext/llama_cpp/src/ggml-metal.m +142 -161
 - data/ext/llama_cpp/src/ggml-metal.metal +577 -500
 - data/ext/llama_cpp/src/ggml.c +2064 -233
 - data/ext/llama_cpp/src/ggml.h +238 -13
 - data/ext/llama_cpp/src/k_quants.c +110 -54
 - data/ext/llama_cpp/src/llama-util.h +10 -8
 - data/ext/llama_cpp/src/llama.cpp +4544 -2890
 - data/ext/llama_cpp/src/llama.h +133 -123
 - data/lib/llama_cpp/version.rb +2 -2
 - data/lib/llama_cpp.rb +1 -1
 - data/sig/llama_cpp.rbs +8 -8
 - metadata +2 -2
 
    
        data/ext/llama_cpp/src/ggml.h
    CHANGED
    
    | 
         @@ -207,7 +207,7 @@ 
     | 
|
| 
       207 
207 
     | 
    
         
             
            #define GGML_MAX_PARAMS        256
         
     | 
| 
       208 
208 
     | 
    
         
             
            #define GGML_MAX_CONTEXTS      64
         
     | 
| 
       209 
209 
     | 
    
         
             
            #define GGML_MAX_SRC           6
         
     | 
| 
       210 
     | 
    
         
            -
            #define GGML_MAX_NAME          48
     | 
| 
      
 210 
     | 
    
         
            +
            #define GGML_MAX_NAME          64
         
     | 
| 
       211 
211 
     | 
    
         
             
            #define GGML_MAX_OP_PARAMS     32
         
     | 
| 
       212 
212 
     | 
    
         
             
            #define GGML_DEFAULT_N_THREADS 4
         
     | 
| 
       213 
213 
     | 
    
         | 
| 
         @@ -215,6 +215,11 @@ 
     | 
|
| 
       215 
215 
     | 
    
         
             
            #define GGML_EXIT_SUCCESS 0
         
     | 
| 
       216 
216 
     | 
    
         
             
            #define GGML_EXIT_ABORTED 1
         
     | 
| 
       217 
217 
     | 
    
         | 
| 
      
 218 
     | 
    
         
            +
            #define GGUF_MAGIC   0x46554747 // "GGUF"
         
     | 
| 
      
 219 
     | 
    
         
            +
            #define GGUF_VERSION 1
         
     | 
| 
      
 220 
     | 
    
         
            +
             
     | 
| 
      
 221 
     | 
    
         
            +
            #define GGUF_DEFAULT_ALIGNMENT 32
         
     | 
| 
      
 222 
     | 
    
         
            +
             
     | 
| 
       218 
223 
     | 
    
         
             
            #define GGML_UNUSED(x) (void)(x)
         
     | 
| 
       219 
224 
     | 
    
         | 
| 
       220 
225 
     | 
    
         
             
            #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
         
     | 
| 
         @@ -255,8 +260,9 @@ 
     | 
|
| 
       255 
260 
     | 
    
         
             
            extern "C" {
         
     | 
| 
       256 
261 
     | 
    
         
             
            #endif
         
     | 
| 
       257 
262 
     | 
    
         | 
| 
       258 
     | 
    
         
            -
            #ifdef __ARM_NEON
     | 
| 
       259 
     | 
    
         
            -
                 
     | 
| 
      
 263 
     | 
    
         
            +
            #if defined(__ARM_NEON) && defined(__CUDACC__)
         
     | 
| 
      
 264 
     | 
    
         
            +
                typedef half ggml_fp16_t;
         
     | 
| 
      
 265 
     | 
    
         
            +
            #elif defined(__ARM_NEON)
         
     | 
| 
       260 
266 
     | 
    
         
             
                typedef __fp16 ggml_fp16_t;
         
     | 
| 
       261 
267 
     | 
    
         
             
            #else
         
     | 
| 
       262 
268 
     | 
    
         
             
                typedef uint16_t ggml_fp16_t;
         
     | 
| 
         @@ -340,10 +346,12 @@ extern "C" { 
     | 
|
| 
       340 
346 
     | 
    
         
             
                    GGML_OP_ARGMAX,
         
     | 
| 
       341 
347 
     | 
    
         
             
                    GGML_OP_REPEAT,
         
     | 
| 
       342 
348 
     | 
    
         
             
                    GGML_OP_REPEAT_BACK,
         
     | 
| 
      
 349 
     | 
    
         
            +
                    GGML_OP_CONCAT,
         
     | 
| 
       343 
350 
     | 
    
         
             
                    GGML_OP_SILU_BACK,
         
     | 
| 
       344 
351 
     | 
    
         
             
                    GGML_OP_NORM, // normalize
         
     | 
| 
       345 
352 
     | 
    
         
             
                    GGML_OP_RMS_NORM,
         
     | 
| 
       346 
353 
     | 
    
         
             
                    GGML_OP_RMS_NORM_BACK,
         
     | 
| 
      
 354 
     | 
    
         
            +
                    GGML_OP_GROUP_NORM,
         
     | 
| 
       347 
355 
     | 
    
         | 
| 
       348 
356 
     | 
    
         
             
                    GGML_OP_MUL_MAT,
         
     | 
| 
       349 
357 
     | 
    
         
             
                    GGML_OP_OUT_PROD,
         
     | 
| 
         @@ -369,14 +377,19 @@ extern "C" { 
     | 
|
| 
       369 
377 
     | 
    
         
             
                    GGML_OP_CLAMP,
         
     | 
| 
       370 
378 
     | 
    
         
             
                    GGML_OP_CONV_1D,
         
     | 
| 
       371 
379 
     | 
    
         
             
                    GGML_OP_CONV_2D,
         
     | 
| 
      
 380 
     | 
    
         
            +
                    GGML_OP_CONV_TRANSPOSE_2D,
         
     | 
| 
       372 
381 
     | 
    
         
             
                    GGML_OP_POOL_1D,
         
     | 
| 
       373 
382 
     | 
    
         
             
                    GGML_OP_POOL_2D,
         
     | 
| 
       374 
383 
     | 
    
         | 
| 
      
 384 
     | 
    
         
            +
                    GGML_OP_UPSCALE, // nearest interpolate
         
     | 
| 
      
 385 
     | 
    
         
            +
             
     | 
| 
       375 
386 
     | 
    
         
             
                    GGML_OP_FLASH_ATTN,
         
     | 
| 
       376 
387 
     | 
    
         
             
                    GGML_OP_FLASH_FF,
         
     | 
| 
       377 
388 
     | 
    
         
             
                    GGML_OP_FLASH_ATTN_BACK,
         
     | 
| 
       378 
389 
     | 
    
         
             
                    GGML_OP_WIN_PART,
         
     | 
| 
       379 
390 
     | 
    
         
             
                    GGML_OP_WIN_UNPART,
         
     | 
| 
      
 391 
     | 
    
         
            +
                    GGML_OP_GET_REL_POS,
         
     | 
| 
      
 392 
     | 
    
         
            +
                    GGML_OP_ADD_REL_POS,
         
     | 
| 
       380 
393 
     | 
    
         | 
| 
       381 
394 
     | 
    
         
             
                    GGML_OP_UNARY,
         
     | 
| 
       382 
395 
     | 
    
         | 
| 
         @@ -562,6 +575,7 @@ extern "C" { 
     | 
|
| 
       562 
575 
     | 
    
         
             
                GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
         
     | 
| 
       563 
576 
     | 
    
         
             
                GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
         
     | 
| 
       564 
577 
     | 
    
         
             
                GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
         
     | 
| 
      
 578 
     | 
    
         
            +
                GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
         
     | 
| 
       565 
579 
     | 
    
         
             
                GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
         
     | 
| 
       566 
580 
     | 
    
         | 
| 
       567 
581 
     | 
    
         
             
                GGML_API int     ggml_blck_size (enum ggml_type type);
         
     | 
| 
         @@ -799,6 +813,13 @@ extern "C" { 
     | 
|
| 
       799 
813 
     | 
    
         
             
                        struct ggml_tensor  * a,
         
     | 
| 
       800 
814 
     | 
    
         
             
                        struct ggml_tensor  * b);
         
     | 
| 
       801 
815 
     | 
    
         | 
| 
      
 816 
     | 
    
         
            +
                // concat a and b on dim 2
         
     | 
| 
      
 817 
     | 
    
         
            +
                // used in stable-diffusion
         
     | 
| 
      
 818 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_concat(
         
     | 
| 
      
 819 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 820 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 821 
     | 
    
         
            +
                        struct ggml_tensor  * b);
         
     | 
| 
      
 822 
     | 
    
         
            +
             
     | 
| 
       802 
823 
     | 
    
         
             
                GGML_API struct ggml_tensor * ggml_abs(
         
     | 
| 
       803 
824 
     | 
    
         
             
                        struct ggml_context * ctx,
         
     | 
| 
       804 
825 
     | 
    
         
             
                        struct ggml_tensor  * a);
         
     | 
| 
         @@ -888,14 +909,15 @@ extern "C" { 
     | 
|
| 
       888 
909 
     | 
    
         
             
                        struct ggml_tensor  * b);
         
     | 
| 
       889 
910 
     | 
    
         | 
| 
       890 
911 
     | 
    
         
             
                // normalize along rows
         
     | 
| 
       891 
     | 
    
         
            -
                // TODO: eps is hardcoded to 1e-5 for now
         
     | 
| 
       892 
912 
     | 
    
         
             
                GGML_API struct ggml_tensor * ggml_norm(
         
     | 
| 
       893 
913 
     | 
    
         
             
                        struct ggml_context * ctx,
         
     | 
| 
       894 
     | 
    
         
            -
                        struct ggml_tensor  * a);
     | 
| 
      
 914 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 915 
     | 
    
         
            +
                        float                 eps);
         
     | 
| 
       895 
916 
     | 
    
         | 
| 
       896 
917 
     | 
    
         
             
                GGML_API struct ggml_tensor * ggml_norm_inplace(
         
     | 
| 
       897 
918 
     | 
    
         
             
                        struct ggml_context * ctx,
         
     | 
| 
       898 
     | 
    
         
            -
                        struct ggml_tensor  * a);
     | 
| 
      
 919 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 920 
     | 
    
         
            +
                        float                 eps);
         
     | 
| 
       899 
921 
     | 
    
         | 
| 
       900 
922 
     | 
    
         
             
                GGML_API struct ggml_tensor * ggml_rms_norm(
         
     | 
| 
       901 
923 
     | 
    
         
             
                        struct ggml_context * ctx,
         
     | 
| 
         @@ -907,6 +929,19 @@ extern "C" { 
     | 
|
| 
       907 
929 
     | 
    
         
             
                        struct ggml_tensor  * a,
         
     | 
| 
       908 
930 
     | 
    
         
             
                        float                 eps);
         
     | 
| 
       909 
931 
     | 
    
         | 
| 
      
 932 
     | 
    
         
            +
                // group normalize along ne0*ne1*n_groups
         
     | 
| 
      
 933 
     | 
    
         
            +
                // used in stable-diffusion
         
     | 
| 
      
 934 
     | 
    
         
            +
                // TODO: eps is hardcoded to 1e-6 for now
         
     | 
| 
      
 935 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_group_norm(
         
     | 
| 
      
 936 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 937 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 938 
     | 
    
         
            +
                        int                   n_groups);
         
     | 
| 
      
 939 
     | 
    
         
            +
             
     | 
| 
      
 940 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_group_norm_inplace(
         
     | 
| 
      
 941 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 942 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 943 
     | 
    
         
            +
                        int                   n_groups);
         
     | 
| 
      
 944 
     | 
    
         
            +
             
     | 
| 
       910 
945 
     | 
    
         
             
                // a - x
         
     | 
| 
       911 
946 
     | 
    
         
             
                // b - dy
         
     | 
| 
       912 
947 
     | 
    
         
             
                // TODO: update with configurable eps
         
     | 
| 
         @@ -1207,6 +1242,15 @@ extern "C" { 
     | 
|
| 
       1207 
1242 
     | 
    
         
             
                        float                 freq_base,
         
     | 
| 
       1208 
1243 
     | 
    
         
             
                        float                 freq_scale);
         
     | 
| 
       1209 
1244 
     | 
    
         | 
| 
      
 1245 
     | 
    
         
            +
                // xPos RoPE, in-place, returns view(a)
         
     | 
| 
      
 1246 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
         
     | 
| 
      
 1247 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 1248 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 1249 
     | 
    
         
            +
                        int                   n_past,
         
     | 
| 
      
 1250 
     | 
    
         
            +
                        int                   n_dims,
         
     | 
| 
      
 1251 
     | 
    
         
            +
                        float                 base,
         
     | 
| 
      
 1252 
     | 
    
         
            +
                        bool                  down);
         
     | 
| 
      
 1253 
     | 
    
         
            +
             
     | 
| 
       1210 
1254 
     | 
    
         
             
                // rotary position embedding backward, i.e compute dx from dy
         
     | 
| 
       1211 
1255 
     | 
    
         
             
                // a - dy
         
     | 
| 
       1212 
1256 
     | 
    
         
             
                GGML_API struct ggml_tensor * ggml_rope_back(
         
     | 
| 
         @@ -1215,7 +1259,11 @@ extern "C" { 
     | 
|
| 
       1215 
1259 
     | 
    
         
             
                        int                   n_past,
         
     | 
| 
       1216 
1260 
     | 
    
         
             
                        int                   n_dims,
         
     | 
| 
       1217 
1261 
     | 
    
         
             
                        int                   mode,
         
     | 
| 
       1218 
     | 
    
         
            -
                        int                   n_ctx);
     | 
| 
      
 1262 
     | 
    
         
            +
                        int                   n_ctx,
         
     | 
| 
      
 1263 
     | 
    
         
            +
                        float                 freq_base,
         
     | 
| 
      
 1264 
     | 
    
         
            +
                        float                 freq_scale,
         
     | 
| 
      
 1265 
     | 
    
         
            +
                        float                 xpos_base,
         
     | 
| 
      
 1266 
     | 
    
         
            +
                        bool                  xpos_down);
         
     | 
| 
       1219 
1267 
     | 
    
         | 
| 
       1220 
1268 
     | 
    
         
             
                // alibi position embedding
         
     | 
| 
       1221 
1269 
     | 
    
         
             
                // in-place, returns view(a)
         
     | 
| 
         @@ -1242,6 +1290,15 @@ extern "C" { 
     | 
|
| 
       1242 
1290 
     | 
    
         
             
                        int                   p0,  // padding
         
     | 
| 
       1243 
1291 
     | 
    
         
             
                        int                   d0); // dilation
         
     | 
| 
       1244 
1292 
     | 
    
         | 
| 
      
 1293 
     | 
    
         
            +
                // conv_1d with padding = half
         
     | 
| 
      
 1294 
     | 
    
         
            +
                // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
         
     | 
| 
      
 1295 
     | 
    
         
            +
                GGML_API struct ggml_tensor* ggml_conv_1d_ph(
         
     | 
| 
      
 1296 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 1297 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 1298 
     | 
    
         
            +
                        struct ggml_tensor  * b,
         
     | 
| 
      
 1299 
     | 
    
         
            +
                        int                   s,
         
     | 
| 
      
 1300 
     | 
    
         
            +
                        int                   d);
         
     | 
| 
      
 1301 
     | 
    
         
            +
             
     | 
| 
       1245 
1302 
     | 
    
         
             
                GGML_API struct ggml_tensor * ggml_conv_2d(
         
     | 
| 
       1246 
1303 
     | 
    
         
             
                        struct ggml_context * ctx,
         
     | 
| 
       1247 
1304 
     | 
    
         
             
                        struct ggml_tensor  * a,
         
     | 
| 
         @@ -1253,14 +1310,38 @@ extern "C" { 
     | 
|
| 
       1253 
1310 
     | 
    
         
             
                        int                   d0,
         
     | 
| 
       1254 
1311 
     | 
    
         
             
                        int                   d1);
         
     | 
| 
       1255 
1312 
     | 
    
         | 
| 
       1256 
     | 
    
         
            -
             
     | 
| 
       1257 
     | 
    
         
            -
                //  
     | 
| 
       1258 
     | 
    
         
            -
                 
     | 
| 
      
 1313 
     | 
    
         
            +
             
     | 
| 
      
 1314 
     | 
    
         
            +
                // kernel size is a->ne[0] x a->ne[1]
         
     | 
| 
      
 1315 
     | 
    
         
            +
                // stride is equal to kernel size
         
     | 
| 
      
 1316 
     | 
    
         
            +
                // padding is zero
         
     | 
| 
      
 1317 
     | 
    
         
            +
                // example:
         
     | 
| 
      
 1318 
     | 
    
         
            +
                // a:     16   16    3  768
         
     | 
| 
      
 1319 
     | 
    
         
            +
                // b:   1024 1024    3    1
         
     | 
| 
      
 1320 
     | 
    
         
            +
                // res:   64   64  768    1
         
     | 
| 
      
 1321 
     | 
    
         
            +
                // used in sam
         
     | 
| 
      
 1322 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
         
     | 
| 
      
 1323 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 1324 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 1325 
     | 
    
         
            +
                        struct ggml_tensor  * b);
         
     | 
| 
      
 1326 
     | 
    
         
            +
             
     | 
| 
      
 1327 
     | 
    
         
            +
                // kernel size is a->ne[0] x a->ne[1]
         
     | 
| 
      
 1328 
     | 
    
         
            +
                // stride is 1
         
     | 
| 
      
 1329 
     | 
    
         
            +
                // padding is half
         
     | 
| 
      
 1330 
     | 
    
         
            +
                // example:
         
     | 
| 
      
 1331 
     | 
    
         
            +
                // a:      3    3    256  256
         
     | 
| 
      
 1332 
     | 
    
         
            +
                // b:     64   64    256    1
         
     | 
| 
      
 1333 
     | 
    
         
            +
                // res:   64   64    256    1
         
     | 
| 
      
 1334 
     | 
    
         
            +
                // used in sam
         
     | 
| 
      
 1335 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
         
     | 
| 
      
 1336 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 1337 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 1338 
     | 
    
         
            +
                        struct ggml_tensor  * b);
         
     | 
| 
      
 1339 
     | 
    
         
            +
             
     | 
| 
      
 1340 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
         
     | 
| 
       1259 
1341 
     | 
    
         
             
                        struct ggml_context * ctx,
         
     | 
| 
       1260 
1342 
     | 
    
         
             
                        struct ggml_tensor  * a,
         
     | 
| 
       1261 
1343 
     | 
    
         
             
                        struct ggml_tensor  * b,
         
     | 
| 
       1262 
     | 
    
         
            -
                        int                   s,
     | 
| 
       1263 
     | 
    
         
            -
                        int                   d);
         
     | 
| 
      
 1344 
     | 
    
         
            +
                        int                   stride);
         
     | 
| 
       1264 
1345 
     | 
    
         | 
| 
       1265 
1346 
     | 
    
         
             
                enum ggml_op_pool {
         
     | 
| 
       1266 
1347 
     | 
    
         
             
                    GGML_OP_POOL_MAX,
         
     | 
| 
         @@ -1287,6 +1368,13 @@ extern "C" { 
     | 
|
| 
       1287 
1368 
     | 
    
         
             
                        int                   p0,
         
     | 
| 
       1288 
1369 
     | 
    
         
             
                        int                   p1);
         
     | 
| 
       1289 
1370 
     | 
    
         | 
| 
      
 1371 
     | 
    
         
            +
                // nearest interpolate
         
     | 
| 
      
 1372 
     | 
    
         
            +
                // used in stable-diffusion
         
     | 
| 
      
 1373 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_upscale(
         
     | 
| 
      
 1374 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 1375 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 1376 
     | 
    
         
            +
                        int                   scale_factor);
         
     | 
| 
      
 1377 
     | 
    
         
            +
             
     | 
| 
       1290 
1378 
     | 
    
         
             
                GGML_API struct ggml_tensor * ggml_flash_attn(
         
     | 
| 
       1291 
1379 
     | 
    
         
             
                        struct ggml_context * ctx,
         
     | 
| 
       1292 
1380 
     | 
    
         
             
                        struct ggml_tensor  * q,
         
     | 
| 
         @@ -1340,6 +1428,27 @@ extern "C" { 
     | 
|
| 
       1340 
1428 
     | 
    
         
             
                    struct ggml_tensor  * a,
         
     | 
| 
       1341 
1429 
     | 
    
         
             
                    enum ggml_unary_op op);
         
     | 
| 
       1342 
1430 
     | 
    
         | 
| 
      
 1431 
     | 
    
         
            +
                // used in sam
         
     | 
| 
      
 1432 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_get_rel_pos(
         
     | 
| 
      
 1433 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 1434 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 1435 
     | 
    
         
            +
                        int                   qh,
         
     | 
| 
      
 1436 
     | 
    
         
            +
                        int                   kh);
         
     | 
| 
      
 1437 
     | 
    
         
            +
             
     | 
| 
      
 1438 
     | 
    
         
            +
                // used in sam
         
     | 
| 
      
 1439 
     | 
    
         
            +
             
     | 
| 
      
 1440 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_add_rel_pos(
         
     | 
| 
      
 1441 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 1442 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 1443 
     | 
    
         
            +
                        struct ggml_tensor  * pw,
         
     | 
| 
      
 1444 
     | 
    
         
            +
                        struct ggml_tensor  * ph);
         
     | 
| 
      
 1445 
     | 
    
         
            +
             
     | 
| 
      
 1446 
     | 
    
         
            +
                GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
         
     | 
| 
      
 1447 
     | 
    
         
            +
                        struct ggml_context * ctx,
         
     | 
| 
      
 1448 
     | 
    
         
            +
                        struct ggml_tensor  * a,
         
     | 
| 
      
 1449 
     | 
    
         
            +
                        struct ggml_tensor  * pw,
         
     | 
| 
      
 1450 
     | 
    
         
            +
                        struct ggml_tensor  * ph);
         
     | 
| 
      
 1451 
     | 
    
         
            +
             
     | 
| 
       1343 
1452 
     | 
    
         
             
                // custom operators
         
     | 
| 
       1344 
1453 
     | 
    
         | 
| 
       1345 
1454 
     | 
    
         
             
                typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
         
     | 
| 
         @@ -1703,6 +1812,118 @@ extern "C" { 
     | 
|
| 
       1703 
1812 
     | 
    
         | 
| 
       1704 
1813 
     | 
    
         
             
                GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
         
     | 
| 
       1705 
1814 
     | 
    
         | 
| 
      
 1815 
     | 
    
         
            +
                //
         
     | 
| 
      
 1816 
     | 
    
         
            +
                // gguf
         
     | 
| 
      
 1817 
     | 
    
         
            +
                //
         
     | 
| 
      
 1818 
     | 
    
         
            +
             
     | 
| 
      
 1819 
     | 
    
         
            +
                enum gguf_type {
         
     | 
| 
      
 1820 
     | 
    
         
            +
                    GGUF_TYPE_UINT8   = 0,
         
     | 
| 
      
 1821 
     | 
    
         
            +
                    GGUF_TYPE_INT8    = 1,
         
     | 
| 
      
 1822 
     | 
    
         
            +
                    GGUF_TYPE_UINT16  = 2,
         
     | 
| 
      
 1823 
     | 
    
         
            +
                    GGUF_TYPE_INT16   = 3,
         
     | 
| 
      
 1824 
     | 
    
         
            +
                    GGUF_TYPE_UINT32  = 4,
         
     | 
| 
      
 1825 
     | 
    
         
            +
                    GGUF_TYPE_INT32   = 5,
         
     | 
| 
      
 1826 
     | 
    
         
            +
                    GGUF_TYPE_FLOAT32 = 6,
         
     | 
| 
      
 1827 
     | 
    
         
            +
                    GGUF_TYPE_BOOL    = 7,
         
     | 
| 
      
 1828 
     | 
    
         
            +
                    GGUF_TYPE_STRING  = 8,
         
     | 
| 
      
 1829 
     | 
    
         
            +
                    GGUF_TYPE_ARRAY   = 9,
         
     | 
| 
      
 1830 
     | 
    
         
            +
                    GGUF_TYPE_COUNT,       // marks the end of the enum
         
     | 
| 
      
 1831 
     | 
    
         
            +
                };
         
     | 
| 
      
 1832 
     | 
    
         
            +
             
     | 
| 
      
 1833 
     | 
    
         
            +
                struct gguf_context;
         
     | 
| 
      
 1834 
     | 
    
         
            +
             
     | 
| 
      
 1835 
     | 
    
         
            +
                struct gguf_init_params {
         
     | 
| 
      
 1836 
     | 
    
         
            +
                    bool no_alloc;
         
     | 
| 
      
 1837 
     | 
    
         
            +
             
     | 
| 
      
 1838 
     | 
    
         
            +
                    // if not NULL, create a ggml_context and allocate the tensor data in it
         
     | 
| 
      
 1839 
     | 
    
         
            +
                    struct ggml_context ** ctx;
         
     | 
| 
      
 1840 
     | 
    
         
            +
                };
         
     | 
| 
      
 1841 
     | 
    
         
            +
             
     | 
| 
      
 1842 
     | 
    
         
            +
                GGML_API struct gguf_context * gguf_init_empty(void);
         
     | 
| 
      
 1843 
     | 
    
         
            +
                GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
         
     | 
| 
      
 1844 
     | 
    
         
            +
                //GGML_API struct gguf_context * gguf_init_from_buffer(..);
         
     | 
| 
      
 1845 
     | 
    
         
            +
             
     | 
| 
      
 1846 
     | 
    
         
            +
                GGML_API void gguf_free(struct gguf_context * ctx);
         
     | 
| 
      
 1847 
     | 
    
         
            +
             
     | 
| 
      
 1848 
     | 
    
         
            +
                GGML_API const char * gguf_type_name(enum gguf_type type);
         
     | 
| 
      
 1849 
     | 
    
         
            +
             
     | 
| 
      
 1850 
     | 
    
         
            +
                GGML_API int    gguf_get_version    (struct gguf_context * ctx);
         
     | 
| 
      
 1851 
     | 
    
         
            +
                GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
         
     | 
| 
      
 1852 
     | 
    
         
            +
                GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
         
     | 
| 
      
 1853 
     | 
    
         
            +
                GGML_API void * gguf_get_data       (struct gguf_context * ctx);
         
     | 
| 
      
 1854 
     | 
    
         
            +
             
     | 
| 
      
 1855 
     | 
    
         
            +
                GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
         
     | 
| 
      
 1856 
     | 
    
         
            +
                GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
         
     | 
| 
      
 1857 
     | 
    
         
            +
                GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1858 
     | 
    
         
            +
             
     | 
| 
      
 1859 
     | 
    
         
            +
                GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1860 
     | 
    
         
            +
                GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
         
     | 
| 
      
 1861 
     | 
    
         
            +
             
     | 
| 
      
 1862 
     | 
    
         
            +
                // results are undefined if the wrong type is used for the key
         
     | 
| 
      
 1863 
     | 
    
         
            +
                GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1864 
     | 
    
         
            +
                GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1865 
     | 
    
         
            +
                GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1866 
     | 
    
         
            +
                GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1867 
     | 
    
         
            +
                GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1868 
     | 
    
         
            +
                GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1869 
     | 
    
         
            +
                GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1870 
     | 
    
         
            +
                GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
         
     | 
| 
      
 1871 
     | 
    
         
            +
                GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1872 
     | 
    
         
            +
                GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1873 
     | 
    
         
            +
                GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
         
     | 
| 
      
 1874 
     | 
    
         
            +
                GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
         
     | 
| 
      
 1875 
     | 
    
         
            +
             
     | 
| 
      
 1876 
     | 
    
         
            +
                GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
         
     | 
| 
      
 1877 
     | 
    
         
            +
                GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
         
     | 
| 
      
 1878 
     | 
    
         
            +
                GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
         
     | 
| 
      
 1879 
     | 
    
         
            +
                GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
         
     | 
| 
      
 1880 
     | 
    
         
            +
             
     | 
| 
      
 1881 
     | 
    
         
            +
                // overrides existing values or adds a new one
         
     | 
| 
      
 1882 
     | 
    
         
            +
                GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
         
     | 
| 
      
 1883 
     | 
    
         
            +
                GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
         
     | 
| 
      
 1884 
     | 
    
         
            +
                GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
         
     | 
| 
      
 1885 
     | 
    
         
            +
                GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
         
     | 
| 
      
 1886 
     | 
    
         
            +
                GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
         
     | 
| 
      
 1887 
     | 
    
         
            +
                GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
         
     | 
| 
      
 1888 
     | 
    
         
            +
                GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
         
     | 
| 
      
 1889 
     | 
    
         
            +
                GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
         
     | 
| 
      
 1890 
     | 
    
         
            +
                GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
         
     | 
| 
      
 1891 
     | 
    
         
            +
                GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
         
     | 
| 
      
 1892 
     | 
    
         
            +
                GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
         
     | 
| 
      
 1893 
     | 
    
         
            +
             
     | 
| 
      
 1894 
     | 
    
         
            +
                // set or add KV pairs from another context
         
     | 
| 
      
 1895 
     | 
    
         
            +
                GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
         
     | 
| 
      
 1896 
     | 
    
         
            +
             
     | 
| 
      
 1897 
     | 
    
         
            +
                // manage tensor info
         
     | 
| 
      
 1898 
     | 
    
         
            +
                GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
         
     | 
| 
      
 1899 
     | 
    
         
            +
                GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
         
     | 
| 
      
 1900 
     | 
    
         
            +
                GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
         
     | 
| 
      
 1901 
     | 
    
         
            +
             
     | 
| 
      
 1902 
     | 
    
         
            +
                // writing gguf files can be done in 2 ways:
         
     | 
| 
      
 1903 
     | 
    
         
            +
                //
         
     | 
| 
      
 1904 
     | 
    
         
            +
                // - write the entire gguf_context to a binary file in a single pass:
         
     | 
| 
      
 1905 
     | 
    
         
            +
                //
         
     | 
| 
      
 1906 
     | 
    
         
            +
                //   gguf_write_to_file(ctx, fname, false); // only_meta = false
         
     | 
| 
      
 1907 
     | 
    
         
            +
                //
         
     | 
| 
      
 1908 
     | 
    
         
            +
                // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
         
     | 
| 
      
 1909 
     | 
    
         
            +
                //
         
     | 
| 
      
 1910 
     | 
    
         
            +
                //   FILE * f = fopen(fname, "wb");
         
     | 
| 
      
 1911 
     | 
    
         
            +
                //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
         
     | 
| 
      
 1912 
     | 
    
         
            +
                //   fwrite(..., f); // write the tensor data
         
     | 
| 
      
 1913 
     | 
    
         
            +
                //   void * data = malloc(gguf_get_meta_size(ctx)); gguf_get_meta_data(ctx, data);
         
     | 
| 
      
 1914 
     | 
    
         
            +
                //   fseek(f, 0, SEEK_SET);
         
     | 
| 
      
 1915 
     | 
    
         
            +
                //   fwrite(data, 1, gguf_get_meta_size(ctx), f);
         
     | 
| 
      
 1916 
     | 
    
         
            +
                //   free(data);
         
     | 
| 
      
 1917 
     | 
    
         
            +
                //   fclose(f);
         
     | 
| 
      
 1918 
     | 
    
         
            +
                //
         
     | 
| 
      
 1919 
     | 
    
         
            +
             
     | 
| 
      
 1920 
     | 
    
         
            +
                // write the entire context to a binary file
         
     | 
| 
      
 1921 
     | 
    
         
            +
                GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
         
     | 
| 
      
 1922 
     | 
    
         
            +
             
     | 
| 
      
 1923 
     | 
    
         
            +
                // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
         
     | 
| 
      
 1924 
     | 
    
         
            +
                GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
         
     | 
| 
      
 1925 
     | 
    
         
            +
                GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
         
     | 
| 
      
 1926 
     | 
    
         
            +
             
     | 
| 
       1706 
1927 
     | 
    
         
             
                //
         
     | 
| 
       1707 
1928 
     | 
    
         
             
                // system info
         
     | 
| 
       1708 
1929 
     | 
    
         
             
                //
         
     | 
| 
         @@ -1740,6 +1961,10 @@ extern "C" { 
     | 
|
| 
       1740 
1961 
     | 
    
         
             
                typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
         
     | 
| 
       1741 
1962 
     | 
    
         | 
| 
       1742 
1963 
     | 
    
         
             
                typedef struct {
         
     | 
| 
      
 1964 
     | 
    
         
            +
                    const char      * type_name;
         
     | 
| 
      
 1965 
     | 
    
         
            +
                    int               blck_size;
         
     | 
| 
      
 1966 
     | 
    
         
            +
                    size_t            type_size;
         
     | 
| 
      
 1967 
     | 
    
         
            +
                    bool              is_quantized;
         
     | 
| 
       1743 
1968 
     | 
    
         
             
                    ggml_to_float_t   to_float;
         
     | 
| 
       1744 
1969 
     | 
    
         
             
                    ggml_from_float_t from_float;
         
     | 
| 
       1745 
1970 
     | 
    
         
             
                    ggml_from_float_t from_float_reference;
         
     | 
| 
         @@ -1747,7 +1972,7 @@ extern "C" { 
     | 
|
| 
       1747 
1972 
     | 
    
         
             
                    enum ggml_type    vec_dot_type;
         
     | 
| 
       1748 
1973 
     | 
    
         
             
                } ggml_type_traits_t;
         
     | 
| 
       1749 
1974 
     | 
    
         | 
| 
       1750 
     | 
    
         
            -
                ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type  
     | 
| 
      
 1975 
     | 
    
         
            +
                ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
         
     | 
| 
       1751 
1976 
     | 
    
         | 
| 
       1752 
1977 
     | 
    
         
             
            #ifdef  __cplusplus
         
     | 
| 
       1753 
1978 
     | 
    
         
             
            }
         
     | 
| 
         @@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * 
     | 
|
| 
       77 
77 
     | 
    
         
             
                    }
         
     | 
| 
       78 
78 
     | 
    
         
             
                    return 1/iscale;
         
     | 
| 
       79 
79 
     | 
    
         
             
                }
         
     | 
| 
      
 80 
     | 
    
         
            +
                bool return_early = false;
         
     | 
| 
      
 81 
     | 
    
         
            +
                if (rmse_type < 0) {
         
     | 
| 
      
 82 
     | 
    
         
            +
                    rmse_type = -rmse_type;
         
     | 
| 
      
 83 
     | 
    
         
            +
                    return_early = true;
         
     | 
| 
      
 84 
     | 
    
         
            +
                }
         
     | 
| 
       80 
85 
     | 
    
         
             
                int weight_type = rmse_type%2;
         
     | 
| 
       81 
86 
     | 
    
         
             
                float sumlx = 0;
         
     | 
| 
       82 
87 
     | 
    
         
             
                float suml2 = 0;
         
     | 
| 
         @@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * 
     | 
|
| 
       89 
94 
     | 
    
         
             
                    suml2 += w*l*l;
         
     | 
| 
       90 
95 
     | 
    
         
             
                }
         
     | 
| 
       91 
96 
     | 
    
         
             
                float scale = sumlx/suml2;
         
     | 
| 
      
 97 
     | 
    
         
            +
                if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
         
     | 
| 
       92 
98 
     | 
    
         
             
                float best = scale * sumlx;
         
     | 
| 
       93 
     | 
    
         
            -
                for (int  
     | 
| 
       94 
     | 
    
         
            -
                    iscale = 1/scale;
         
     | 
| 
       95 
     | 
    
         
            -
                    float slx = 0;
         
     | 
| 
       96 
     | 
    
         
            -
                    float sl2 = 0;
         
     | 
| 
       97 
     | 
    
         
            -
                    bool changed = false;
         
     | 
| 
       98 
     | 
    
         
            -
                    for (int i = 0; i < n; ++i) {
         
     | 
| 
       99 
     | 
    
         
            -
                        int l = nearest_int(iscale * x[i]);
         
     | 
| 
       100 
     | 
    
         
            -
                        l = MAX(-nmax, MIN(nmax-1, l));
         
     | 
| 
       101 
     | 
    
         
            -
                        if (l + nmax != L[i]) { changed = true; }
         
     | 
| 
       102 
     | 
    
         
            -
                        float w = weight_type == 1 ? x[i] * x[i] : 1.f;
         
     | 
| 
       103 
     | 
    
         
            -
                        slx += w*x[i]*l;
         
     | 
| 
       104 
     | 
    
         
            -
                        sl2 += w*l*l;
         
     | 
| 
       105 
     | 
    
         
            -
                    }
         
     | 
| 
       106 
     | 
    
         
            -
                    if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; }
         
     | 
| 
       107 
     | 
    
         
            -
                    for (int i = 0; i < n; ++i) {
         
     | 
| 
       108 
     | 
    
         
            -
                        int l = nearest_int(iscale * x[i]);
         
     | 
| 
       109 
     | 
    
         
            -
                        L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
         
     | 
| 
       110 
     | 
    
         
            -
                    }
         
     | 
| 
       111 
     | 
    
         
            -
                    sumlx = slx; suml2 = sl2;
         
     | 
| 
       112 
     | 
    
         
            -
                    scale = sumlx/suml2;
         
     | 
| 
       113 
     | 
    
         
            -
                    best = scale * sumlx;
         
     | 
| 
       114 
     | 
    
         
            -
                }
         
     | 
| 
       115 
     | 
    
         
            -
                for (int itry = 0; itry < 5; ++itry) {
         
     | 
| 
       116 
     | 
    
         
            -
                    int n_changed = 0;
         
     | 
| 
       117 
     | 
    
         
            -
                    for (int i = 0; i < n; ++i) {
         
     | 
| 
       118 
     | 
    
         
            -
                        float w = weight_type == 1 ? x[i]*x[i] : 1;
         
     | 
| 
       119 
     | 
    
         
            -
                        int l = L[i] - nmax;
         
     | 
| 
       120 
     | 
    
         
            -
                        float slx = sumlx - w*x[i]*l;
         
     | 
| 
       121 
     | 
    
         
            -
                        if (slx > 0) {
         
     | 
| 
       122 
     | 
    
         
            -
                            float sl2 = suml2 - w*l*l;
         
     | 
| 
       123 
     | 
    
         
            -
                            int new_l = nearest_int(x[i] * sl2 / slx);
         
     | 
| 
       124 
     | 
    
         
            -
                            new_l = MAX(-nmax, MIN(nmax-1, new_l));
         
     | 
| 
       125 
     | 
    
         
            -
                            if (new_l != l) {
         
     | 
| 
       126 
     | 
    
         
            -
                                slx += w*x[i]*new_l;
         
     | 
| 
       127 
     | 
    
         
            -
                                sl2 += w*new_l*new_l;
         
     | 
| 
       128 
     | 
    
         
            -
                                if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
         
     | 
| 
       129 
     | 
    
         
            -
                                    L[i] = nmax + new_l; sumlx = slx; suml2 = sl2;
         
     | 
| 
       130 
     | 
    
         
            -
                                    scale = sumlx / suml2; best = scale * sumlx;
         
     | 
| 
       131 
     | 
    
         
            -
                                    ++n_changed;
         
     | 
| 
       132 
     | 
    
         
            -
                                }
         
     | 
| 
       133 
     | 
    
         
            -
                            }
         
     | 
| 
       134 
     | 
    
         
            -
                        }
         
     | 
| 
       135 
     | 
    
         
            -
                    }
         
     | 
| 
       136 
     | 
    
         
            -
                    if (!n_changed) { break; }
         
     | 
| 
       137 
     | 
    
         
            -
                }
         
     | 
| 
       138 
     | 
    
         
            -
                if (rmse_type < 3) {
         
     | 
| 
       139 
     | 
    
         
            -
                    return scale;
         
     | 
| 
       140 
     | 
    
         
            -
                }
         
     | 
| 
       141 
     | 
    
         
            -
                for (int is = -4; is <= 4; ++is) {
         
     | 
| 
      
 99 
     | 
    
         
            +
                for (int is = -9; is <= 9; ++is) {
         
     | 
| 
       142 
100 
     | 
    
         
             
                    if (is == 0) {
         
     | 
| 
       143 
101 
     | 
    
         
             
                        continue;
         
     | 
| 
       144 
102 
     | 
    
         
             
                    }
         
     | 
| 
         @@ -221,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * 
     | 
|
| 
       221 
179 
     | 
    
         
             
                return 1/iscale;
         
     | 
| 
       222 
180 
     | 
    
         
             
            }
         
     | 
| 
       223 
181 
     | 
    
         | 
| 
       224 
     | 
    
         
            -
            static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, 
     | 
| 
      
 182 
     | 
    
         
            +
            static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
         
     | 
| 
      
 183 
     | 
    
         
            +
                    int ntry, float alpha) {
         
     | 
| 
       225 
184 
     | 
    
         
             
                float min = x[0];
         
     | 
| 
       226 
185 
     | 
    
         
             
                float max = x[0];
         
     | 
| 
      
 186 
     | 
    
         
            +
                float sum_x = 0;
         
     | 
| 
      
 187 
     | 
    
         
            +
                float sum_x2 = 0;
         
     | 
| 
       227 
188 
     | 
    
         
             
                for (int i = 1; i < n; ++i) {
         
     | 
| 
       228 
189 
     | 
    
         
             
                    if (x[i] < min) min = x[i];
         
     | 
| 
       229 
190 
     | 
    
         
             
                    if (x[i] > max) max = x[i];
         
     | 
| 
      
 191 
     | 
    
         
            +
                    sum_x += x[i];
         
     | 
| 
      
 192 
     | 
    
         
            +
                    sum_x2 += x[i]*x[i];
         
     | 
| 
       230 
193 
     | 
    
         
             
                }
         
     | 
| 
       231 
194 
     | 
    
         
             
                if (max == min) {
         
     | 
| 
       232 
195 
     | 
    
         
             
                    for (int i = 0; i < n; ++i) L[i] = 0;
         
     | 
| 
         @@ -254,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t 
     | 
|
| 
       254 
217 
     | 
    
         
             
                    for (int i = 0; i < n; ++i) {
         
     | 
| 
       255 
218 
     | 
    
         
             
                        sum += x[i] - scale*L[i];
         
     | 
| 
       256 
219 
     | 
    
         
             
                    }
         
     | 
| 
       257 
     | 
    
         
            -
                    min = sum/n;
         
     | 
| 
      
 220 
     | 
    
         
            +
                    min = alpha*min + (1 - alpha)*sum/n;
         
     | 
| 
       258 
221 
     | 
    
         
             
                    if (min > 0) min = 0;
         
     | 
| 
       259 
222 
     | 
    
         
             
                    iscale = 1/scale;
         
     | 
| 
       260 
223 
     | 
    
         
             
                    if (!did_change) break;
         
     | 
| 
         @@ -263,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t 
     | 
|
| 
       263 
226 
     | 
    
         
             
                return scale;
         
     | 
| 
       264 
227 
     | 
    
         
             
            }
         
     | 
| 
       265 
228 
     | 
    
         | 
| 
      
 229 
     | 
    
         
            +
            static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
         
     | 
| 
      
 230 
     | 
    
         
            +
                    uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
         
     | 
| 
      
 231 
     | 
    
         
            +
                    float rmin, float rdelta, int nstep, bool use_mad) {
         
     | 
| 
      
 232 
     | 
    
         
            +
                float min = x[0];
         
     | 
| 
      
 233 
     | 
    
         
            +
                float max = x[0];
         
     | 
| 
      
 234 
     | 
    
         
            +
                float sum_w = weights[0];
         
     | 
| 
      
 235 
     | 
    
         
            +
                float sum_x = sum_w * x[0];
         
     | 
| 
      
 236 
     | 
    
         
            +
                for (int i = 1; i < n; ++i) {
         
     | 
| 
      
 237 
     | 
    
         
            +
                    if (x[i] < min) min = x[i];
         
     | 
| 
      
 238 
     | 
    
         
            +
                    if (x[i] > max) max = x[i];
         
     | 
| 
      
 239 
     | 
    
         
            +
                    float w = weights[i];
         
     | 
| 
      
 240 
     | 
    
         
            +
                    sum_w += w;
         
     | 
| 
      
 241 
     | 
    
         
            +
                    sum_x += w * x[i];
         
     | 
| 
      
 242 
     | 
    
         
            +
                }
         
     | 
| 
      
 243 
     | 
    
         
            +
                if (min > 0) min = 0;
         
     | 
| 
      
 244 
     | 
    
         
            +
                if (max == min) {
         
     | 
| 
      
 245 
     | 
    
         
            +
                    for (int i = 0; i < n; ++i) L[i] = 0;
         
     | 
| 
      
 246 
     | 
    
         
            +
                    *the_min = -min;
         
     | 
| 
      
 247 
     | 
    
         
            +
                    return 0.f;
         
     | 
| 
      
 248 
     | 
    
         
            +
                }
         
     | 
| 
      
 249 
     | 
    
         
            +
                float iscale = nmax/(max - min);
         
     | 
| 
      
 250 
     | 
    
         
            +
                float scale = 1/iscale;
         
     | 
| 
      
 251 
     | 
    
         
            +
                float best_mad = 0;
         
     | 
| 
      
 252 
     | 
    
         
            +
                for (int i = 0; i < n; ++i) {
         
     | 
| 
      
 253 
     | 
    
         
            +
                    int l = nearest_int(iscale*(x[i] - min));
         
     | 
| 
      
 254 
     | 
    
         
            +
                    L[i] = MAX(0, MIN(nmax, l));
         
     | 
| 
      
 255 
     | 
    
         
            +
                    float diff = scale * L[i] + min - x[i];
         
     | 
| 
      
 256 
     | 
    
         
            +
                    diff = use_mad ? fabsf(diff) : diff * diff;
         
     | 
| 
      
 257 
     | 
    
         
            +
                    float w = weights[i];
         
     | 
| 
      
 258 
     | 
    
         
            +
                    best_mad += w * diff;
         
     | 
| 
      
 259 
     | 
    
         
            +
                }
         
     | 
| 
      
 260 
     | 
    
         
            +
                if (nstep < 1) {
         
     | 
| 
      
 261 
     | 
    
         
            +
                    *the_min = -min;
         
     | 
| 
      
 262 
     | 
    
         
            +
                    return scale;
         
     | 
| 
      
 263 
     | 
    
         
            +
                }
         
     | 
| 
      
 264 
     | 
    
         
            +
                for (int is = 0; is <= nstep; ++is) {
         
     | 
| 
      
 265 
     | 
    
         
            +
                    iscale = (rmin + rdelta*is + nmax)/(max - min);
         
     | 
| 
      
 266 
     | 
    
         
            +
                    float sum_l = 0, sum_l2 = 0, sum_xl = 0;
         
     | 
| 
      
 267 
     | 
    
         
            +
                    for (int i = 0; i < n; ++i) {
         
     | 
| 
      
 268 
     | 
    
         
            +
                        int l = nearest_int(iscale*(x[i] - min));
         
     | 
| 
      
 269 
     | 
    
         
            +
                        l = MAX(0, MIN(nmax, l));
         
     | 
| 
      
 270 
     | 
    
         
            +
                        Laux[i] = l;
         
     | 
| 
      
 271 
     | 
    
         
            +
                        float w = weights[i];
         
     | 
| 
      
 272 
     | 
    
         
            +
                        sum_l += w*l;
         
     | 
| 
      
 273 
     | 
    
         
            +
                        sum_l2 += w*l*l;
         
     | 
| 
      
 274 
     | 
    
         
            +
                        sum_xl += w*l*x[i];
         
     | 
| 
      
 275 
     | 
    
         
            +
                    }
         
     | 
| 
      
 276 
     | 
    
         
            +
                    float D = sum_w * sum_l2 - sum_l * sum_l;
         
     | 
| 
      
 277 
     | 
    
         
            +
                    if (D > 0) {
         
     | 
| 
      
 278 
     | 
    
         
            +
                        float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
         
     | 
| 
      
 279 
     | 
    
         
            +
                        float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
         
     | 
| 
      
 280 
     | 
    
         
            +
                        if (this_min > 0) {
         
     | 
| 
      
 281 
     | 
    
         
            +
                            this_min = 0;
         
     | 
| 
      
 282 
     | 
    
         
            +
                            this_scale = sum_xl / sum_l2;
         
     | 
| 
      
 283 
     | 
    
         
            +
                        }
         
     | 
| 
      
 284 
     | 
    
         
            +
                        float mad = 0;
         
     | 
| 
      
 285 
     | 
    
         
            +
                        for (int i = 0; i < n; ++i) {
         
     | 
| 
      
 286 
     | 
    
         
            +
                            float diff = this_scale * Laux[i] + this_min - x[i];
         
     | 
| 
      
 287 
     | 
    
         
            +
                            diff = use_mad ? fabsf(diff) : diff * diff;
         
     | 
| 
      
 288 
     | 
    
         
            +
                            float w = weights[i];
         
     | 
| 
      
 289 
     | 
    
         
            +
                            mad += w * diff;
         
     | 
| 
      
 290 
     | 
    
         
            +
                        }
         
     | 
| 
      
 291 
     | 
    
         
            +
                        if (mad < best_mad) {
         
     | 
| 
      
 292 
     | 
    
         
            +
                            for (int i = 0; i < n; ++i) {
         
     | 
| 
      
 293 
     | 
    
         
            +
                                L[i] = Laux[i];
         
     | 
| 
      
 294 
     | 
    
         
            +
                            }
         
     | 
| 
      
 295 
     | 
    
         
            +
                            best_mad = mad;
         
     | 
| 
      
 296 
     | 
    
         
            +
                            scale = this_scale;
         
     | 
| 
      
 297 
     | 
    
         
            +
                            min = this_min;
         
     | 
| 
      
 298 
     | 
    
         
            +
                        }
         
     | 
| 
      
 299 
     | 
    
         
            +
                    }
         
     | 
| 
      
 300 
     | 
    
         
            +
                }
         
     | 
| 
      
 301 
     | 
    
         
            +
                *the_min = -min;
         
     | 
| 
      
 302 
     | 
    
         
            +
                return scale;
         
     | 
| 
      
 303 
     | 
    
         
            +
            }
         
     | 
| 
      
 304 
     | 
    
         
            +
             
     | 
| 
       266 
305 
     | 
    
         
             
            #if QK_K == 256
         
     | 
| 
       267 
306 
     | 
    
         
             
            static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
         
     | 
| 
       268 
307 
     | 
    
         
             
                if (j < 4) {
         
     | 
| 
         @@ -281,6 +320,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict 
     | 
|
| 
       281 
320 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       282 
321 
     | 
    
         | 
| 
       283 
322 
     | 
    
         
             
                uint8_t L[QK_K];
         
     | 
| 
      
 323 
     | 
    
         
            +
                uint8_t Laux[16];
         
     | 
| 
      
 324 
     | 
    
         
            +
                float   weights[16];
         
     | 
| 
       284 
325 
     | 
    
         
             
                float mins[QK_K/16];
         
     | 
| 
       285 
326 
     | 
    
         
             
                float scales[QK_K/16];
         
     | 
| 
       286 
327 
     | 
    
         | 
| 
         @@ -291,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict 
     | 
|
| 
       291 
332 
     | 
    
         
             
                    float max_scale = 0; // as we are deducting the min, scales are always positive
         
     | 
| 
       292 
333 
     | 
    
         
             
                    float max_min = 0;
         
     | 
| 
       293 
334 
     | 
    
         
             
                    for (int j = 0; j < QK_K/16; ++j) {
         
     | 
| 
       294 
     | 
    
         
            -
                         
     | 
| 
      
 335 
     | 
    
         
            +
                        for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
         
     | 
| 
      
 336 
     | 
    
         
            +
                        scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
         
     | 
| 
       295 
337 
     | 
    
         
             
                        float scale = scales[j];
         
     | 
| 
       296 
338 
     | 
    
         
             
                        if (scale > max_scale) {
         
     | 
| 
       297 
339 
     | 
    
         
             
                            max_scale = scale;
         
     | 
| 
         @@ -637,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict 
     | 
|
| 
       637 
679 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       638 
680 
     | 
    
         | 
| 
       639 
681 
     | 
    
         
             
                uint8_t L[QK_K];
         
     | 
| 
      
 682 
     | 
    
         
            +
                uint8_t Laux[32];
         
     | 
| 
      
 683 
     | 
    
         
            +
                float   weights[32];
         
     | 
| 
       640 
684 
     | 
    
         
             
                float mins[QK_K/32];
         
     | 
| 
       641 
685 
     | 
    
         
             
                float scales[QK_K/32];
         
     | 
| 
       642 
686 
     | 
    
         | 
| 
         @@ -645,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict 
     | 
|
| 
       645 
689 
     | 
    
         
             
                    float max_scale = 0; // as we are deducting the min, scales are always positive
         
     | 
| 
       646 
690 
     | 
    
         
             
                    float max_min = 0;
         
     | 
| 
       647 
691 
     | 
    
         
             
                    for (int j = 0; j < QK_K/32; ++j) {
         
     | 
| 
       648 
     | 
    
         
            -
                        scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j],  
     | 
| 
      
 692 
     | 
    
         
            +
                        //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
         
     | 
| 
      
 693 
     | 
    
         
            +
                        float sum_x2 = 0;
         
     | 
| 
      
 694 
     | 
    
         
            +
                        for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
         
     | 
| 
      
 695 
     | 
    
         
            +
                        float av_x = sqrtf(sum_x2/32);
         
     | 
| 
      
 696 
     | 
    
         
            +
                        for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
         
     | 
| 
      
 697 
     | 
    
         
            +
                        scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
         
     | 
| 
       649 
698 
     | 
    
         
             
                        float scale = scales[j];
         
     | 
| 
       650 
699 
     | 
    
         
             
                        if (scale > max_scale) {
         
     | 
| 
       651 
700 
     | 
    
         
             
                            max_scale = scale;
         
     | 
| 
         @@ -798,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict 
     | 
|
| 
       798 
847 
     | 
    
         
             
                uint8_t L[QK_K];
         
     | 
| 
       799 
848 
     | 
    
         
             
                float mins[QK_K/32];
         
     | 
| 
       800 
849 
     | 
    
         
             
                float scales[QK_K/32];
         
     | 
| 
      
 850 
     | 
    
         
            +
                float weights[32];
         
     | 
| 
      
 851 
     | 
    
         
            +
                uint8_t Laux[32];
         
     | 
| 
       801 
852 
     | 
    
         
             
            #else
         
     | 
| 
       802 
853 
     | 
    
         
             
                int8_t L[QK_K];
         
     | 
| 
       803 
854 
     | 
    
         
             
                float scales[QK_K/16];
         
     | 
| 
         @@ -810,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict 
     | 
|
| 
       810 
861 
     | 
    
         
             
                    float max_scale = 0; // as we are deducting the min, scales are always positive
         
     | 
| 
       811 
862 
     | 
    
         
             
                    float max_min = 0;
         
     | 
| 
       812 
863 
     | 
    
         
             
                    for (int j = 0; j < QK_K/32; ++j) {
         
     | 
| 
       813 
     | 
    
         
            -
                        scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j],  
     | 
| 
      
 864 
     | 
    
         
            +
                        //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
         
     | 
| 
      
 865 
     | 
    
         
            +
                        float sum_x2 = 0;
         
     | 
| 
      
 866 
     | 
    
         
            +
                        for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
         
     | 
| 
      
 867 
     | 
    
         
            +
                        float av_x = sqrtf(sum_x2/32);
         
     | 
| 
      
 868 
     | 
    
         
            +
                        for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
         
     | 
| 
      
 869 
     | 
    
         
            +
                        scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
         
     | 
| 
       814 
870 
     | 
    
         
             
                        float scale = scales[j];
         
     | 
| 
       815 
871 
     | 
    
         
             
                        if (scale > max_scale) {
         
     | 
| 
       816 
872 
     | 
    
         
             
                            max_scale = scale;
         
     | 
| 
         @@ -273,14 +273,16 @@ struct llama_mmap { 
     | 
|
| 
       273 
273 
     | 
    
         | 
| 
       274 
274 
     | 
    
         
             
                    #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         
     | 
| 
       275 
275 
     | 
    
         
             
                    if (prefetch) {
         
     | 
| 
       276 
     | 
    
         
            -
             
     | 
| 
       277 
     | 
    
         
            -
             
     | 
| 
       278 
     | 
    
         
            -
             
     | 
| 
       279 
     | 
    
         
            -
             
     | 
| 
       280 
     | 
    
         
            -
             
     | 
| 
       281 
     | 
    
         
            -
             
     | 
| 
       282 
     | 
    
         
            -
             
     | 
| 
       283 
     | 
    
         
            -
             
     | 
| 
      
 276 
     | 
    
         
            +
                      // Advise the kernel to preload the mapped memory
         
     | 
| 
      
 277 
     | 
    
         
            +
             
     | 
| 
      
 278 
     | 
    
         
            +
                      WIN32_MEMORY_RANGE_ENTRY range;
         
     | 
| 
      
 279 
     | 
    
         
            +
                      range.VirtualAddress = addr;
         
     | 
| 
      
 280 
     | 
    
         
            +
             
     | 
| 
      
 281 
     | 
    
         
            +
                      range.NumberOfBytes = (SIZE_T)size;
         
     | 
| 
      
 282 
     | 
    
         
            +
                      if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
         
     | 
| 
      
 283 
     | 
    
         
            +
                          fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
         
     | 
| 
      
 284 
     | 
    
         
            +
                                  llama_format_win_err(GetLastError()).c_str());
         
     | 
| 
      
 285 
     | 
    
         
            +
                      }
         
     | 
| 
       284 
286 
     | 
    
         
             
                    }
         
     | 
| 
       285 
287 
     | 
    
         
             
                    #else
         
     | 
| 
       286 
288 
     | 
    
         
             
                    #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
         
     |