llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +16 -0
 - data/examples/chat.rb +2 -4
 - data/ext/llama_cpp/extconf.rb +1 -0
 - data/ext/llama_cpp/llama_cpp.cpp +27 -0
 - data/lib/llama_cpp/version.rb +2 -2
 - data/sig/llama_cpp.rbs +14 -0
 - data/vendor/tmp/llama.cpp/LICENSE +1 -1
 - data/vendor/tmp/llama.cpp/Makefile +81 -20
 - data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
 - data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
 - data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
 - data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
 - data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
 - data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
 - data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
 - data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
 - data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
 - data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
 - data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
 - data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
 - data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
 - data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
 - data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
 - data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
 - data/vendor/tmp/llama.cpp/ggml.c +141 -101
 - data/vendor/tmp/llama.cpp/ggml.h +18 -12
 - data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
 - data/vendor/tmp/llama.cpp/llama.h +145 -29
 - data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
 - data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
 - data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
 - data/vendor/tmp/llama.cpp/unicode.h +2 -0
 - metadata +5 -3
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
         
     | 
    
        data/CHANGELOG.md
    CHANGED
    
    | 
         @@ -1,3 +1,19 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            - Bump llama.cpp from b2608 to b2658.
         
     | 
| 
      
 4 
     | 
    
         
            +
              - Add magic number constants.
         
     | 
| 
      
 5 
     | 
    
         
            +
              - Add `token_cls` and `token_sep` methods to `Model`.
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            Implementation of bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file has been skipped.
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            - Bump llama.cpp from b2496 to b2573.
         
     | 
| 
      
 12 
     | 
    
         
            +
              - Add file type constants.
         
     | 
| 
      
 13 
     | 
    
         
            +
            - Bump llama.cpp from b2573 to b2608.
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            Implementation of bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 has been skipped.
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
       1 
17 
     | 
    
         
             
            ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
         
     | 
| 
       2 
18 
     | 
    
         | 
| 
       3 
19 
     | 
    
         
             
            - Bump llama.cpp from b2435 to b2496.
         
     | 
    
        data/examples/chat.rb
    CHANGED
    
    | 
         @@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation 
     | 
|
| 
       127 
127 
     | 
    
         
             
                  end
         
     | 
| 
       128 
128 
     | 
    
         | 
| 
       129 
129 
     | 
    
         
             
                  if input_echo
         
     | 
| 
       130 
     | 
    
         
            -
                    output =  
     | 
| 
       131 
     | 
    
         
            -
                    embd.each { |token| output << context.model.token_to_piece(token) }
         
     | 
| 
      
 130 
     | 
    
         
            +
                    output = embd.map { |token| context.model.token_to_piece(token) }
         
     | 
| 
       132 
131 
     | 
    
         
             
                    output_str = output.join
         
     | 
| 
       133 
132 
     | 
    
         
             
                    output_str.chomp!(antiprompt) if first_input
         
     | 
| 
       134 
133 
     | 
    
         
             
                    print(output_str)
         
     | 
| 
         @@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation 
     | 
|
| 
       136 
135 
     | 
    
         | 
| 
       137 
136 
     | 
    
         
             
                  if embd_input.size <= n_consumed
         
     | 
| 
       138 
137 
     | 
    
         
             
                    if antiprompt.size.positive?
         
     | 
| 
       139 
     | 
    
         
            -
                      last_output =  
     | 
| 
       140 
     | 
    
         
            -
                      last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
         
     | 
| 
      
 138 
     | 
    
         
            +
                      last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
         
     | 
| 
       141 
139 
     | 
    
         
             
                      last_output_str = last_output.join
         
     | 
| 
       142 
140 
     | 
    
         | 
| 
       143 
141 
     | 
    
         
             
                      search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
         
     | 
    
        data/ext/llama_cpp/extconf.rb
    CHANGED
    
    | 
         @@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/) 
     | 
|
| 
       37 
37 
     | 
    
         
             
                abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
         
     | 
| 
       38 
38 
     | 
    
         
             
              end
         
     | 
| 
       39 
39 
     | 
    
         
             
              FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
         
     | 
| 
      
 40 
     | 
    
         
            +
              FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
         
     | 
| 
       40 
41 
     | 
    
         
             
            end
         
     | 
| 
       41 
42 
     | 
    
         | 
| 
       42 
43 
     | 
    
         
             
            abort('libstdc++ is not found.') unless have_library('stdc++')
         
     | 
    
        data/ext/llama_cpp/llama_cpp.cpp
    CHANGED
    
    | 
         @@ -1478,6 +1478,8 @@ public: 
     | 
|
| 
       1478 
1478 
     | 
    
         
             
                rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
         
     | 
| 
       1479 
1479 
     | 
    
         
             
                rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
         
     | 
| 
       1480 
1480 
     | 
    
         
             
                rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
         
     | 
| 
      
 1481 
     | 
    
         
            +
                rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
         
     | 
| 
      
 1482 
     | 
    
         
            +
                rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
         
     | 
| 
       1481 
1483 
     | 
    
         
             
                rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
         
     | 
| 
       1482 
1484 
     | 
    
         
             
                rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
         
     | 
| 
       1483 
1485 
     | 
    
         
             
                rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
         
     | 
| 
         @@ -1743,6 +1745,16 @@ private: 
     | 
|
| 
       1743 
1745 
     | 
    
         
             
                return INT2NUM(llama_token_eos(ptr->model));
         
     | 
| 
       1744 
1746 
     | 
    
         
             
              }
         
     | 
| 
       1745 
1747 
     | 
    
         | 
| 
      
 1748 
     | 
    
         
            +
              static VALUE _llama_model_token_cls(VALUE self) {
         
     | 
| 
      
 1749 
     | 
    
         
            +
                LLaMAModelWrapper* ptr = get_llama_model(self);
         
     | 
| 
      
 1750 
     | 
    
         
            +
                return INT2NUM(llama_token_cls(ptr->model));
         
     | 
| 
      
 1751 
     | 
    
         
            +
              }
         
     | 
| 
      
 1752 
     | 
    
         
            +
             
     | 
| 
      
 1753 
     | 
    
         
            +
              static VALUE _llama_model_token_sep(VALUE self) {
         
     | 
| 
      
 1754 
     | 
    
         
            +
                LLaMAModelWrapper* ptr = get_llama_model(self);
         
     | 
| 
      
 1755 
     | 
    
         
            +
                return INT2NUM(llama_token_sep(ptr->model));
         
     | 
| 
      
 1756 
     | 
    
         
            +
              }
         
     | 
| 
      
 1757 
     | 
    
         
            +
             
     | 
| 
       1746 
1758 
     | 
    
         
             
              static VALUE _llama_model_token_nl(VALUE self) {
         
     | 
| 
       1747 
1759 
     | 
    
         
             
                LLaMAModelWrapper* ptr = get_llama_model(self);
         
     | 
| 
       1748 
1760 
     | 
    
         
             
                return INT2NUM(llama_token_nl(ptr->model));
         
     | 
| 
         @@ -3371,6 +3383,10 @@ extern "C" void Init_llama_cpp(void) { 
     | 
|
| 
       3371 
3383 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
         
     | 
| 
       3372 
3384 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
         
     | 
| 
       3373 
3385 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
         
     | 
| 
      
 3386 
     | 
    
         
            +
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
         
     | 
| 
      
 3387 
     | 
    
         
            +
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
         
     | 
| 
      
 3388 
     | 
    
         
            +
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
         
     | 
| 
      
 3389 
     | 
    
         
            +
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
         
     | 
| 
       3374 
3390 
     | 
    
         | 
| 
       3375 
3391 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
         
     | 
| 
       3376 
3392 
     | 
    
         | 
| 
         @@ -3410,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) { 
     | 
|
| 
       3410 
3426 
     | 
    
         
             
              ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
         
     | 
| 
       3411 
3427 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
         
     | 
| 
       3412 
3428 
     | 
    
         | 
| 
      
 3429 
     | 
    
         
            +
              ss_magic.str("");
         
     | 
| 
      
 3430 
     | 
    
         
            +
              ss_magic.clear(std::stringstream::goodbit);
         
     | 
| 
      
 3431 
     | 
    
         
            +
              ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
         
     | 
| 
      
 3432 
     | 
    
         
            +
              rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
         
     | 
| 
      
 3433 
     | 
    
         
            +
             
     | 
| 
       3413 
3434 
     | 
    
         
             
              ss_magic.str("");
         
     | 
| 
       3414 
3435 
     | 
    
         
             
              ss_magic.clear(std::stringstream::goodbit);
         
     | 
| 
       3415 
3436 
     | 
    
         
             
              ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
         
     | 
| 
       3416 
3437 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
         
     | 
| 
       3417 
3438 
     | 
    
         | 
| 
      
 3439 
     | 
    
         
            +
              ss_magic.str("");
         
     | 
| 
      
 3440 
     | 
    
         
            +
              ss_magic.clear(std::stringstream::goodbit);
         
     | 
| 
      
 3441 
     | 
    
         
            +
              ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
         
     | 
| 
      
 3442 
     | 
    
         
            +
              rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
         
     | 
| 
      
 3443 
     | 
    
         
            +
             
     | 
| 
       3418 
3444 
     | 
    
         
             
              ss_magic.str("");
         
     | 
| 
       3419 
3445 
     | 
    
         
             
              ss_magic.clear(std::stringstream::goodbit);
         
     | 
| 
       3420 
3446 
     | 
    
         
             
              ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
         
     | 
| 
       3421 
3447 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
         
     | 
| 
       3422 
3448 
     | 
    
         | 
| 
       3423 
3449 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
         
     | 
| 
      
 3450 
     | 
    
         
            +
              rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
         
     | 
| 
       3424 
3451 
     | 
    
         
             
            }
         
     | 
    
        data/lib/llama_cpp/version.rb
    CHANGED
    
    | 
         @@ -3,8 +3,8 @@ 
     | 
|
| 
       3 
3 
     | 
    
         
             
            # llama_cpp.rb provides Ruby bindings for the llama.cpp.
         
     | 
| 
       4 
4 
     | 
    
         
             
            module LLaMACpp
         
     | 
| 
       5 
5 
     | 
    
         
             
              # The version of llama_cpp.rb you install.
         
     | 
| 
       6 
     | 
    
         
            -
              VERSION = '0.14.3'
     | 
| 
      
 6 
     | 
    
         
            +
              VERSION = '0.14.5'
         
     | 
| 
       7 
7 
     | 
    
         | 
| 
       8 
8 
     | 
    
         
             
              # The version of llama.cpp bundled with llama_cpp.rb.
         
     | 
| 
       9 
     | 
    
         
            -
              LLAMA_CPP_VERSION = 'b2496'
     | 
| 
      
 9 
     | 
    
         
            +
              LLAMA_CPP_VERSION = 'b2658'
         
     | 
| 
       10 
10 
     | 
    
         
             
            end
         
     | 
    
        data/sig/llama_cpp.rbs
    CHANGED
    
    | 
         @@ -3,6 +3,14 @@ module LLaMACpp 
     | 
|
| 
       3 
3 
     | 
    
         
             
              LLAMA_CPP_VERSION: String
         
     | 
| 
       4 
4 
     | 
    
         
             
              LLAMA_DEFALUT_SEED: String
         
     | 
| 
       5 
5 
     | 
    
         | 
| 
      
 6 
     | 
    
         
            +
              LLAMA_FILE_MAGIC_GGLA: String
         
     | 
| 
      
 7 
     | 
    
         
            +
              LLAMA_FILE_MAGIC_GGSN: String
         
     | 
| 
      
 8 
     | 
    
         
            +
              LLAMA_FILE_MAGIC_GGSQ: String
         
     | 
| 
      
 9 
     | 
    
         
            +
              LLAMA_SESSION_MAGIC: String
         
     | 
| 
      
 10 
     | 
    
         
            +
              LLAMA_SESSION_VERSION: String
         
     | 
| 
      
 11 
     | 
    
         
            +
              LLAMA_STATE_SEQ_MAGIC: String
         
     | 
| 
      
 12 
     | 
    
         
            +
              LLAMA_STATE_SEQ_VERSION: String
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
       6 
14 
     | 
    
         
             
              LLAMA_VOCAB_TYPE_NONE: Integer
         
     | 
| 
       7 
15 
     | 
    
         
             
              LLAMA_VOCAB_TYPE_SPM: Integer
         
     | 
| 
       8 
16 
     | 
    
         
             
              LLAMA_VOCAB_TYPE_BPE: Integer
         
     | 
| 
         @@ -32,6 +40,10 @@ module LLaMACpp 
     | 
|
| 
       32 
40 
     | 
    
         
             
              LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
         
     | 
| 
       33 
41 
     | 
    
         
             
              LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
         
     | 
| 
       34 
42 
     | 
    
         
             
              LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
         
     | 
| 
      
 43 
     | 
    
         
            +
              LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
         
     | 
| 
      
 44 
     | 
    
         
            +
              LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
         
     | 
| 
      
 45 
     | 
    
         
            +
              LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
         
     | 
| 
      
 46 
     | 
    
         
            +
              LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
         
     | 
| 
       35 
47 
     | 
    
         | 
| 
       36 
48 
     | 
    
         
             
              LLAMA_KV_OVERRIDE_TYPE_INT: Integer
         
     | 
| 
       37 
49 
     | 
    
         
             
              LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
         
     | 
| 
         @@ -120,6 +132,8 @@ module LLaMACpp 
     | 
|
| 
       120 
132 
     | 
    
         
             
                def type: (Integer) -> Integer
         
     | 
| 
       121 
133 
     | 
    
         
             
                def token_bos: () -> Integer
         
     | 
| 
       122 
134 
     | 
    
         
             
                def token_eos: () -> Integer
         
     | 
| 
      
 135 
     | 
    
         
            +
                def token_cls: () -> Integer
         
     | 
| 
      
 136 
     | 
    
         
            +
                def token_sep: () -> Integer
         
     | 
| 
       123 
137 
     | 
    
         
             
                def token_nl: () -> Integer
         
     | 
| 
       124 
138 
     | 
    
         
             
                def add_bos_token?: () -> bool
         
     | 
| 
       125 
139 
     | 
    
         
             
                def add_eos_token?: () -> bool
         
     | 
| 
         @@ -1,8 +1,8 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # Define the default target now so that it is always the first target
         
     | 
| 
       2 
2 
     | 
    
         
             
            BUILD_TARGETS = \
         
     | 
| 
       3 
3 
     | 
    
         
             
            	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
         
     | 
| 
       4 
     | 
    
         
            -
            	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
         
     | 
| 
       5 
     | 
    
         
            -
            	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
         
     | 
| 
      
 4 
     | 
    
         
            +
            	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search  \
         
     | 
| 
      
 5 
     | 
    
         
            +
            	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
7 
     | 
    
         
             
            # Binaries only useful for tests
         
     | 
| 
       8 
8 
     | 
    
         
             
            TEST_TARGETS = \
         
     | 
| 
         @@ -10,7 +10,7 @@ TEST_TARGETS = \ 
     | 
|
| 
       10 
10 
     | 
    
         
             
            	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
         
     | 
| 
       11 
11 
     | 
    
         
             
            	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
         
     | 
| 
       12 
12 
     | 
    
         
             
            	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease                                 \
         
     | 
| 
       13 
     | 
    
         
            -
            	tests/test-json-schema-to-grammar
         
     | 
| 
      
 13 
     | 
    
         
            +
            	tests/test-json-schema-to-grammar tests/test-grammar-integration
         
     | 
| 
       14 
14 
     | 
    
         | 
| 
       15 
15 
     | 
    
         
             
            # Code coverage output files
         
     | 
| 
       16 
16 
     | 
    
         
             
            COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
         
     | 
| 
         @@ -392,14 +392,20 @@ ifdef LLAMA_BLIS 
     | 
|
| 
       392 
392 
     | 
    
         
             
            endif # LLAMA_BLIS
         
     | 
| 
       393 
393 
     | 
    
         | 
| 
       394 
394 
     | 
    
         
             
            ifdef LLAMA_CUBLAS
         
     | 
| 
      
 395 
     | 
    
         
            +
            # LLAMA_CUBLAS is deprecated and will be removed in the future
         
     | 
| 
      
 396 
     | 
    
         
            +
            	LLAMA_CUDA := 1
         
     | 
| 
      
 397 
     | 
    
         
            +
            endif
         
     | 
| 
      
 398 
     | 
    
         
            +
             
     | 
| 
      
 399 
     | 
    
         
            +
            ifdef LLAMA_CUDA
         
     | 
| 
       395 
400 
     | 
    
         
             
            	ifneq ('', '$(wildcard /opt/cuda)')
         
     | 
| 
       396 
401 
     | 
    
         
             
            		CUDA_PATH ?= /opt/cuda
         
     | 
| 
       397 
402 
     | 
    
         
             
            	else
         
     | 
| 
       398 
403 
     | 
    
         
             
            		CUDA_PATH ?= /usr/local/cuda
         
     | 
| 
       399 
404 
     | 
    
         
             
            	endif
         
     | 
| 
       400 
     | 
    
         
            -
            	MK_CPPFLAGS  += - 
     | 
| 
      
 405 
     | 
    
         
            +
            	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
         
     | 
| 
       401 
406 
     | 
    
         
             
            	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
         
     | 
| 
       402 
407 
     | 
    
         
             
            	OBJS         += ggml-cuda.o
         
     | 
| 
      
 408 
     | 
    
         
            +
            	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
         
     | 
| 
       403 
409 
     | 
    
         
             
            	MK_NVCCFLAGS += -use_fast_math
         
     | 
| 
       404 
410 
     | 
    
         
             
            ifdef LLAMA_FATAL_WARNINGS
         
     | 
| 
       405 
411 
     | 
    
         
             
            	MK_NVCCFLAGS += -Werror all-warnings
         
     | 
| 
         @@ -454,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE 
     | 
|
| 
       454 
460 
     | 
    
         
             
            else
         
     | 
| 
       455 
461 
     | 
    
         
             
            	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
         
     | 
| 
       456 
462 
     | 
    
         
             
            endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
         
     | 
| 
       457 
     | 
    
         
            -
             
     | 
| 
       458 
     | 
    
         
            -
             
     | 
| 
       459 
     | 
    
         
            -
             
     | 
| 
      
 463 
     | 
    
         
            +
            ifdef LLAMA_CUDA_NO_PEER_COPY
         
     | 
| 
      
 464 
     | 
    
         
            +
            	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
         
     | 
| 
      
 465 
     | 
    
         
            +
            endif # LLAMA_CUDA_NO_PEER_COPY
         
     | 
| 
       460 
466 
     | 
    
         
             
            ifdef LLAMA_CUDA_CCBIN
         
     | 
| 
       461 
467 
     | 
    
         
             
            	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
         
     | 
| 
       462 
468 
     | 
    
         
             
            endif
         
     | 
| 
       463 
     | 
    
         
            -
             
     | 
| 
      
 469 
     | 
    
         
            +
             
     | 
| 
       464 
470 
     | 
    
         
             
            ifdef JETSON_EOL_MODULE_DETECT
         
     | 
| 
       465 
     | 
    
         
            -
             
     | 
| 
      
 471 
     | 
    
         
            +
            define NVCC_COMPILE
         
     | 
| 
      
 472 
     | 
    
         
            +
            	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
         
     | 
| 
      
 473 
     | 
    
         
            +
            endef # NVCC_COMPILE
         
     | 
| 
       466 
474 
     | 
    
         
             
            else
         
     | 
| 
      
 475 
     | 
    
         
            +
            define NVCC_COMPILE
         
     | 
| 
       467 
476 
     | 
    
         
             
            	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
         
     | 
| 
      
 477 
     | 
    
         
            +
            endef # NVCC_COMPILE
         
     | 
| 
       468 
478 
     | 
    
         
             
            endif # JETSON_EOL_MODULE_DETECT
         
     | 
| 
       469 
     | 
    
         
            -
             
     | 
| 
      
 479 
     | 
    
         
            +
             
     | 
| 
      
 480 
     | 
    
         
            +
            ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
         
     | 
| 
      
 481 
     | 
    
         
            +
            	$(NVCC_COMPILE)
         
     | 
| 
      
 482 
     | 
    
         
            +
             
     | 
| 
      
 483 
     | 
    
         
            +
            ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
         
     | 
| 
      
 484 
     | 
    
         
            +
            	$(NVCC_COMPILE)
         
     | 
| 
      
 485 
     | 
    
         
            +
             
     | 
| 
      
 486 
     | 
    
         
            +
            endif # LLAMA_CUDA
         
     | 
| 
       470 
487 
     | 
    
         | 
| 
       471 
488 
     | 
    
         
             
            ifdef LLAMA_CLBLAST
         
     | 
| 
       472 
489 
     | 
    
         | 
| 
         @@ -512,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h 
     | 
|
| 
       512 
529 
     | 
    
         
             
            endif # LLAMA_VULKAN
         
     | 
| 
       513 
530 
     | 
    
         | 
| 
       514 
531 
     | 
    
         
             
            ifdef LLAMA_HIPBLAS
         
     | 
| 
       515 
     | 
    
         
            -
             
     | 
| 
       516 
532 
     | 
    
         
             
            	ifeq ($(wildcard /opt/rocm),)
         
     | 
| 
       517 
533 
     | 
    
         
             
            		ROCM_PATH	?= /usr
         
     | 
| 
       518 
534 
     | 
    
         
             
            		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
         
     | 
| 
         @@ -524,7 +540,7 @@ ifdef LLAMA_HIPBLAS 
     | 
|
| 
       524 
540 
     | 
    
         
             
            	LLAMA_CUDA_DMMV_X       ?= 32
         
     | 
| 
       525 
541 
     | 
    
         
             
            	LLAMA_CUDA_MMV_Y        ?= 1
         
     | 
| 
       526 
542 
     | 
    
         
             
            	LLAMA_CUDA_KQUANTS_ITER ?= 2
         
     | 
| 
       527 
     | 
    
         
            -
            	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
     | 
| 
      
 543 
     | 
    
         
            +
            	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
         
     | 
| 
       528 
544 
     | 
    
         
             
            ifdef LLAMA_HIP_UMA
         
     | 
| 
       529 
545 
     | 
    
         
             
            	MK_CPPFLAGS += -DGGML_HIP_UMA
         
     | 
| 
       530 
546 
     | 
    
         
             
            endif # LLAMA_HIP_UMA
         
     | 
| 
         @@ -537,9 +553,18 @@ endif # LLAMA_HIP_UMA 
     | 
|
| 
       537 
553 
     | 
    
         
             
            ifdef LLAMA_CUDA_FORCE_DMMV
         
     | 
| 
       538 
554 
     | 
    
         
             
            	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
         
     | 
| 
       539 
555 
     | 
    
         
             
            endif # LLAMA_CUDA_FORCE_DMMV
         
     | 
| 
      
 556 
     | 
    
         
            +
            ifdef LLAMA_CUDA_NO_PEER_COPY
         
     | 
| 
      
 557 
     | 
    
         
            +
            	HIPFLAGS 	+= -DGGML_CUDA_NO_PEER_COPY
         
     | 
| 
      
 558 
     | 
    
         
            +
            endif # LLAMA_CUDA_NO_PEER_COPY
         
     | 
| 
       540 
559 
     | 
    
         
             
            	OBJS        += ggml-cuda.o
         
     | 
| 
       541 
     | 
    
         
            -
             
     | 
| 
      
 560 
     | 
    
         
            +
            	OBJS        += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
         
     | 
| 
      
 561 
     | 
    
         
            +
             
     | 
| 
      
 562 
     | 
    
         
            +
            ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
         
     | 
| 
       542 
563 
     | 
    
         
             
            	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
         
     | 
| 
      
 564 
     | 
    
         
            +
             
     | 
| 
      
 565 
     | 
    
         
            +
            ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
         
     | 
| 
      
 566 
     | 
    
         
            +
            	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
         
     | 
| 
      
 567 
     | 
    
         
            +
             
     | 
| 
       543 
568 
     | 
    
         
             
            endif # LLAMA_HIPBLAS
         
     | 
| 
       544 
569 
     | 
    
         | 
| 
       545 
570 
     | 
    
         
             
            ifdef LLAMA_METAL
         
     | 
| 
         @@ -592,7 +617,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS) 
     | 
|
| 
       592 
617 
     | 
    
         
             
            override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
         
     | 
| 
       593 
618 
     | 
    
         | 
| 
       594 
619 
     | 
    
         
             
            # identify CUDA host compiler
         
     | 
| 
       595 
     | 
    
         
            -
            ifdef LLAMA_CUBLAS
     | 
| 
      
 620 
     | 
    
         
            +
            ifdef LLAMA_CUDA
         
     | 
| 
       596 
621 
     | 
    
         
             
            GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
         
     | 
| 
       597 
622 
     | 
    
         
             
            include scripts/get-flags.mk
         
     | 
| 
       598 
623 
     | 
    
         
             
            CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
         
     | 
| 
         @@ -617,19 +642,26 @@ $(info I NVCCFLAGS: $(NVCCFLAGS)) 
     | 
|
| 
       617 
642 
     | 
    
         
             
            $(info I LDFLAGS:   $(LDFLAGS))
         
     | 
| 
       618 
643 
     | 
    
         
             
            $(info I CC:        $(shell $(CC)   --version | head -n 1))
         
     | 
| 
       619 
644 
     | 
    
         
             
            $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
         
     | 
| 
       620 
     | 
    
         
            -
            ifdef LLAMA_CUBLAS
     | 
| 
      
 645 
     | 
    
         
            +
            ifdef LLAMA_CUDA
         
     | 
| 
       621 
646 
     | 
    
         
             
            $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
         
     | 
| 
       622 
647 
     | 
    
         
             
            CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
         
     | 
| 
       623 
648 
     | 
    
         
             
            ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
         
     | 
| 
       624 
649 
     | 
    
         
             
            ifndef CUDA_DOCKER_ARCH
         
     | 
| 
       625 
650 
     | 
    
         
             
            ifndef CUDA_POWER_ARCH
         
     | 
| 
       626 
     | 
    
         
            -
            $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
         
     | 
| 
      
 651 
     | 
    
         
            +
            $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
         
     | 
| 
       627 
652 
     | 
    
         
             
            endif # CUDA_POWER_ARCH
         
     | 
| 
       628 
653 
     | 
    
         
             
            endif # CUDA_DOCKER_ARCH
         
     | 
| 
       629 
654 
     | 
    
         
             
            endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
         
     | 
| 
       630 
     | 
    
         
            -
            endif # LLAMA_CUBLAS
     | 
| 
      
 655 
     | 
    
         
            +
            endif # LLAMA_CUDA
         
     | 
| 
       631 
656 
     | 
    
         
             
            $(info )
         
     | 
| 
       632 
657 
     | 
    
         | 
| 
      
 658 
     | 
    
         
            +
            ifdef LLAMA_CUBLAS
         
     | 
| 
      
 659 
     | 
    
         
            +
            $(info !!!!)
         
     | 
| 
      
 660 
     | 
    
         
            +
            $(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
         
     | 
| 
      
 661 
     | 
    
         
            +
            $(info !!!!)
         
     | 
| 
      
 662 
     | 
    
         
            +
            $(info )
         
     | 
| 
      
 663 
     | 
    
         
            +
            endif
         
     | 
| 
      
 664 
     | 
    
         
            +
             
     | 
| 
       633 
665 
     | 
    
         
             
            #
         
     | 
| 
       634 
666 
     | 
    
         
             
            # Build library
         
     | 
| 
       635 
667 
     | 
    
         
             
            #
         
     | 
| 
         @@ -649,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h 
     | 
|
| 
       649 
681 
     | 
    
         
             
            unicode.o: unicode.cpp unicode.h
         
     | 
| 
       650 
682 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $@
         
     | 
| 
       651 
683 
     | 
    
         | 
| 
       652 
     | 
    
         
            -
             
     | 
| 
      
 684 
     | 
    
         
            +
            unicode-data.o: unicode-data.cpp unicode-data.h
         
     | 
| 
      
 685 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) -c $< -o $@
         
     | 
| 
      
 686 
     | 
    
         
            +
             
     | 
| 
      
 687 
     | 
    
         
            +
            OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
         
     | 
| 
       653 
688 
     | 
    
         | 
| 
       654 
689 
     | 
    
         
             
            llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
         
     | 
| 
       655 
690 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $@
         
     | 
| 
         @@ -675,6 +710,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t 
     | 
|
| 
       675 
710 
     | 
    
         
             
            train.o: common/train.cpp common/train.h
         
     | 
| 
       676 
711 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $@
         
     | 
| 
       677 
712 
     | 
    
         | 
| 
      
 713 
     | 
    
         
            +
            ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
         
     | 
| 
      
 714 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) -c $< -o $@
         
     | 
| 
      
 715 
     | 
    
         
            +
             
     | 
| 
       678 
716 
     | 
    
         
             
            libllama.so: llama.o ggml.o $(OBJS)
         
     | 
| 
       679 
717 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
         
     | 
| 
       680 
718 
     | 
    
         | 
| 
         @@ -686,7 +724,8 @@ lib: llama.o ggml.o $(OBJS) 
     | 
|
| 
       686 
724 
     | 
    
         
             
            	ar rcs libllama.a $^
         
     | 
| 
       687 
725 
     | 
    
         | 
| 
       688 
726 
     | 
    
         
             
            clean:
         
     | 
| 
       689 
     | 
    
         
            -
            	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
         
     | 
| 
      
 727 
     | 
    
         
            +
            	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
         
     | 
| 
      
 728 
     | 
    
         
            +
            	rm -vrf ggml-cuda/*.o
         
     | 
| 
       690 
729 
     | 
    
         | 
| 
       691 
730 
     | 
    
         
             
            #
         
     | 
| 
       692 
731 
     | 
    
         
             
            # Examples
         
     | 
| 
         @@ -766,6 +805,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O 
     | 
|
| 
       766 
805 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
       767 
806 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
       768 
807 
     | 
    
         | 
| 
      
 808 
     | 
    
         
            +
            eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
         
     | 
| 
      
 809 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
      
 810 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
      
 811 
     | 
    
         
            +
             
     | 
| 
       769 
812 
     | 
    
         
             
            train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
         
     | 
| 
       770 
813 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
       771 
814 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
         @@ -803,6 +846,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) 
     | 
|
| 
       803 
846 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
       804 
847 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
       805 
848 
     | 
    
         | 
| 
      
 849 
     | 
    
         
            +
            retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
         
     | 
| 
      
 850 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
      
 851 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
      
 852 
     | 
    
         
            +
             
     | 
| 
       806 
853 
     | 
    
         
             
            speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
         
     | 
| 
       807 
854 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
       808 
855 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
         @@ -815,14 +862,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS 
     | 
|
| 
       815 
862 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
       816 
863 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
       817 
864 
     | 
    
         | 
| 
       818 
     | 
    
         
            -
            lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
         
     | 
| 
      
 865 
     | 
    
         
            +
            lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
         
     | 
| 
       819 
866 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
       820 
867 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
      
 868 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
         
     | 
| 
      
 869 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
         
     | 
| 
      
 870 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
         
     | 
| 
      
 871 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
         
     | 
| 
      
 872 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
         
     | 
| 
      
 873 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
         
     | 
| 
       821 
874 
     | 
    
         | 
| 
       822 
875 
     | 
    
         
             
            passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
         
     | 
| 
       823 
876 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
       824 
877 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
       825 
878 
     | 
    
         | 
| 
      
 879 
     | 
    
         
            +
            gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
         
     | 
| 
      
 880 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
      
 881 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
      
 882 
     | 
    
         
            +
             
     | 
| 
       826 
883 
     | 
    
         
             
            ifeq ($(UNAME_S),Darwin)
         
     | 
| 
       827 
884 
     | 
    
         
             
            swift: examples/batched.swift
         
     | 
| 
       828 
885 
     | 
    
         
             
            	(cd examples/batched.swift; make build)
         
     | 
| 
         @@ -870,6 +927,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar- 
     | 
|
| 
       870 
927 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
       871 
928 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
       872 
929 
     | 
    
         | 
| 
      
 930 
     | 
    
         
            +
            tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
         
     | 
| 
      
 931 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
      
 932 
     | 
    
         
            +
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
      
 933 
     | 
    
         
            +
             
     | 
| 
       873 
934 
     | 
    
         
             
            tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
         
     | 
| 
       874 
935 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
         
     | 
| 
       875 
936 
     | 
    
         
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
         
     | 
| 
         @@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c 
     | 
|
| 
       705 
705 
     | 
    
         
             
                    struct ggml_tensor * leaf = graph->leafs[i];
         
     | 
| 
       706 
706 
     | 
    
         
             
                    struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         
     | 
| 
       707 
707 
     | 
    
         
             
                    galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         
     | 
| 
       708 
     | 
    
         
            -
                     
     | 
| 
       709 
     | 
    
         
            -
             
     | 
| 
      
 708 
     | 
    
         
            +
                    if (leaf->view_src || leaf->data) {
         
     | 
| 
      
 709 
     | 
    
         
            +
                        galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
         
     | 
| 
      
 710 
     | 
    
         
            +
                        galloc->leaf_allocs[i].leaf.size_max = 0;
         
     | 
| 
      
 711 
     | 
    
         
            +
                    } else {
         
     | 
| 
      
 712 
     | 
    
         
            +
                        galloc->leaf_allocs[i].leaf.offset = hn->offset;
         
     | 
| 
      
 713 
     | 
    
         
            +
                        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         
     | 
| 
      
 714 
     | 
    
         
            +
                    }
         
     | 
| 
       710 
715 
     | 
    
         
             
                }
         
     | 
| 
       711 
716 
     | 
    
         | 
| 
       712 
717 
     | 
    
         
             
                // reallocate buffers if needed
         
     | 
| 
         @@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) { 
     | 
|
| 
       420 
420 
     | 
    
         
             
                ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
         
     | 
| 
       421 
421 
     | 
    
         | 
| 
       422 
422 
     | 
    
         
             
                // add forward decls here to avoid including the backend headers
         
     | 
| 
       423 
     | 
    
         
            -
            #ifdef GGML_USE_CUBLAS
     | 
| 
      
 423 
     | 
    
         
            +
            #ifdef GGML_USE_CUDA
         
     | 
| 
       424 
424 
     | 
    
         
             
                extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
         
     | 
| 
       425 
425 
     | 
    
         
             
                ggml_backend_cuda_reg_devices();
         
     | 
| 
       426 
426 
     | 
    
         
             
            #endif
         
     | 
| 
         @@ -137,7 +137,7 @@ extern "C" { 
     | 
|
| 
       137 
137 
     | 
    
         
             
                /*
         
     | 
| 
       138 
138 
     | 
    
         
             
                  Example usage:
         
     | 
| 
       139 
139 
     | 
    
         | 
| 
       140 
     | 
    
         
            -
                    // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be  
     | 
| 
      
 140 
     | 
    
         
            +
                    // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
         
     | 
| 
       141 
141 
     | 
    
         
             
                    // preferably to run on the same backend as the buffer
         
     | 
| 
       142 
142 
     | 
    
         
             
                    ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         
     | 
| 
       143 
143 
     | 
    
         | 
| 
         @@ -377,6 +377,27 @@ typedef struct { 
     | 
|
| 
       377 
377 
     | 
    
         
             
            } block_iq1_s;
         
     | 
| 
       378 
378 
     | 
    
         
             
            static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
         
     | 
| 
       379 
379 
     | 
    
         | 
| 
      
 380 
     | 
    
         
            +
            // 1.75 bpw
         
     | 
| 
      
 381 
     | 
    
         
            +
            typedef struct {
         
     | 
| 
      
 382 
     | 
    
         
            +
                uint8_t  qs[QK_K/8];      // grid index, low 8 bits
         
     | 
| 
      
 383 
     | 
    
         
            +
                uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
         
     | 
| 
      
 384 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 385 
     | 
    
         
            +
                ggml_half d;
         
     | 
| 
      
 386 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 387 
     | 
    
         
            +
                uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
         
     | 
| 
      
 388 
     | 
    
         
            +
            } block_iq1_m;
         
     | 
| 
      
 389 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 390 
     | 
    
         
            +
            static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
         
     | 
| 
      
 391 
     | 
    
         
            +
            #else
         
     | 
| 
      
 392 
     | 
    
         
            +
            static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
         
     | 
| 
      
 393 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 394 
     | 
    
         
            +
             
     | 
| 
      
 395 
     | 
    
         
            +
            // Used by IQ1_M quants
         
     | 
| 
      
 396 
     | 
    
         
            +
            typedef union {
         
     | 
| 
      
 397 
     | 
    
         
            +
                ggml_half f16;
         
     | 
| 
      
 398 
     | 
    
         
            +
                uint16_t  u16;
         
     | 
| 
      
 399 
     | 
    
         
            +
            } iq1m_scale_t;
         
     | 
| 
      
 400 
     | 
    
         
            +
             
     | 
| 
       380 
401 
     | 
    
         
             
            // Non-linear quants
         
     | 
| 
       381 
402 
     | 
    
         
             
            #define QK4_NL 32
         
     | 
| 
       382 
403 
     | 
    
         
             
            typedef struct {
         
     | 
| 
         @@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_ 
     | 
|
| 
       426 
447 
     | 
    
         | 
| 
       427 
448 
     | 
    
         
             
            #define GGML_COMMON_IMPL
         
     | 
| 
       428 
449 
     | 
    
         
             
            #elif defined(GGML_COMMON_IMPL_SYCL)
         
     | 
| 
      
 450 
     | 
    
         
            +
             
     | 
| 
       429 
451 
     | 
    
         
             
            #include <cstdint>
         
     | 
| 
       430 
452 
     | 
    
         | 
| 
       431 
     | 
    
         
            -
            #define GGML_TABLE_BEGIN(type, name, size) static  
     | 
| 
       432 
     | 
    
         
            -
            #define GGML_TABLE_END() } 
     | 
| 
      
 453 
     | 
    
         
            +
            #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
         
     | 
| 
      
 454 
     | 
    
         
            +
            #define GGML_TABLE_END() };
         
     | 
| 
       433 
455 
     | 
    
         | 
| 
       434 
456 
     | 
    
         
             
            #define GGML_COMMON_IMPL
         
     | 
| 
       435 
457 
     | 
    
         
             
            #endif
         
     | 
| 
         @@ -1050,6 +1072,7 @@ GGML_TABLE_END() 
     | 
|
| 
       1050 
1072 
     | 
    
         | 
| 
       1051 
1073 
     | 
    
         
             
            #define NGRID_IQ1S 2048
         
     | 
| 
       1052 
1074 
     | 
    
         
             
            #define IQ1S_DELTA 0.125f
         
     | 
| 
      
 1075 
     | 
    
         
            +
            #define IQ1M_DELTA 0.125f
         
     | 
| 
       1053 
1076 
     | 
    
         
             
            #if defined(GGML_COMMON_IMPL_C)
         
     | 
| 
       1054 
1077 
     | 
    
         
             
            GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
         
     | 
| 
       1055 
1078 
     | 
    
         
             
                0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
         
     |