llama_cpp 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
    
        checksums.yaml
    CHANGED
    
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: bf8532fb7d2d96acd42b0da600cd5a8923411545817cb433036c182cb6d549ca
+  data.tar.gz: 2aa68d4ffe814632d6b8f2d97f6407284520d2b540b8920c40d486c599221bc3
 SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: ea8dad06ae15f9ca6ba585ae901d163bc6580543131338bbe785444083791b6251b2f631725190f9935740d0169520e3da604a66330e5bf7551031b7dc47dd81
+  data.tar.gz: 3b32180e6a4653af2afac59d4640c5d06b29f5872d7ef40f33dcddc27b97e2eb0fa2f38fe6389ddcb60b4631db9e3ebfd0d4b14cc9de6419e50452b4c67ad98a
    
        data/CHANGELOG.md
    CHANGED
    
@@ -1,3 +1,32 @@
+## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
+
+- Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
+- Fix installation files for CUDA.
+- Add metal config option:
+  ```
+  $ gem install llama_cpp -- --with-metal
+  ```
+  ```ruby
+  require 'llama_cpp'
+
+  params = LLaMACpp::ContextParams.new
+  params.n_gpu_layers = 1
+
+  context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
+  LLaMACpp.generate(context, 'Hello, world.')
+  ```
+
+**Breaking Changes**
+
+- Add ModelQuantizationParams class.
+- Change the argument of the `model_quantize` module function in LLaMACpp.
+  ```ruby
+  require 'llama_cpp'
+
+  params = LLaMACpp::ModelQuantizeParams.new
+  LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
+  ```
+
 ## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
 
 - Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
    
        data/ext/llama_cpp/extconf.rb
    CHANGED
    
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 require 'mkmf'
+require 'fileutils'
 
 abort 'libstdc++ is not found.' unless have_library('stdc++')
 
@@ -36,17 +37,30 @@ if with_config('accelerate')
   $CFLAGS << ' -DGGML_USE_ACCELERATE'
 end
 
+if with_config('metal')
+  $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+  $CXXFLAGS << ' -DGGML_USE_METAL'
+  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders'
+  $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
+end
+
 if with_config('cublas')
   $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
   $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
   $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
 end
 
 if with_config('clblast')
   abort 'libclblast is not found.' unless have_library('clblast')
-  abort 'libOpenCL is not found.' unless have_library('OpenCL')
 
   $CFLAGS << ' -DGGML_USE_CLBLAST'
+  $CXXFLAGS << ' -DGGML_USE_CLBLAST'
+  if RUBY_PLATFORM.match?(/darwin/)
+    $LDFLAGS << ' -framework OpenCL'
+  else
+    abort 'libOpenCL is not found.' unless have_library('OpenCL')
+  end
 end
 
 UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -78,3 +92,14 @@ if with_config('cublas')
     f.puts "\tnvcc -arch=native -c -o $@ $<"
   end
 end
+
+if with_config('metal')
+  File.open('Makefile', 'a') do |f|
+    f.puts 'ggml-metal.o: ggml-metal.m ggml-metal.h'
+    f.puts "\t$(CC) $(CFLAGS) -c $< -o $@"
+  end
+
+  metal_path = File.expand_path("#{__dir__}/src/ggml-metal.metal")
+  dest_path = File.expand_path("#{__dir__}/../../lib/llama_cpp/")
+  FileUtils.cp(metal_path, dest_path)
+end
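
The extconf.rb changes above add a Metal build option (defining `GGML_USE_METAL`, linking the Apple Metal frameworks, appending `ggml-metal.o` to the object list, and copying `ggml-metal.metal` into `lib/llama_cpp/`), propagate the CUDA and CLBlast defines to `$CXXFLAGS`, and link the OpenCL framework instead of `libOpenCL` on macOS. A rough sketch of how these `with_config` switches are passed at install time; only `--with-metal` appears in the CHANGELOG, the other flag names are inferred from the `with_config` keys above:

```
$ gem install llama_cpp -- --with-metal
$ gem install llama_cpp -- --with-cublas    # expects the CUDA toolkit under /usr/local/cuda
$ gem install llama_cpp -- --with-clblast   # needs libclblast (and libOpenCL outside macOS)
```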
    
        data/ext/llama_cpp/llama_cpp.cpp
    CHANGED
    
@@ -4,6 +4,7 @@
 VALUE rb_mLLaMACpp;
 VALUE rb_cLLaMAContext;
 VALUE rb_cLLaMAContextParams;
+VALUE rb_cLLaMAModelQuantizeParams;
 VALUE rb_cLLaMATokenData;
 VALUE rb_cLLaMATokenDataArray;
 
@@ -292,6 +293,13 @@ public:
     // rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
+    rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
+    rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
+    rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
     rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
     rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
     rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -329,6 +337,55 @@ private:
     return INT2NUM(ptr->params.n_ctx);
   };
 
+  // n_batch
+  static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_batch = NUM2INT(n_batch);
+    return INT2NUM(ptr->params.n_batch);
+  };
+
+  static VALUE _llama_context_params_get_n_batch(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_batch);
+  };
+
+  // n_gpu_layers
+  static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_gpu_layers = NUM2INT(n_gpu_layers);
+    return INT2NUM(ptr->params.n_gpu_layers);
+  };
+
+  static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_gpu_layers);
+  };
+
+  // main_gpu
+  static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.main_gpu = NUM2INT(main_gpu);
+    return INT2NUM(ptr->params.main_gpu);
+  };
+
+  static VALUE _llama_context_params_get_main_gpu(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.main_gpu);
+  };
+
+  // tensor_split
+  static VALUE _llama_context_params_get_tensor_split(VALUE self) {
+    if (LLAMA_MAX_DEVICES < 1) {
+      return rb_ary_new();
+    }
+    VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
+      rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
+    }
+    return ret;
+  };
+
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
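
The accessors above expose the new `llama_context_params` fields (`n_batch`, `n_gpu_layers`, `main_gpu`, and the read-only `tensor_split` array) to Ruby. A minimal usage sketch, with a placeholder model path; the attribute names are exactly the ones registered with `rb_define_method` in the previous hunk:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_batch = 512       # batch size used for prompt evaluation
params.n_gpu_layers = 32   # layers to offload when a GPU backend is compiled in
params.main_gpu = 0        # device index used as the main GPU
p params.tensor_split      # read-only Array of LLAMA_MAX_DEVICES floats

context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
```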
@@ -424,6 +481,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
   RUBY_TYPED_FREE_IMMEDIATELY
 };
 
+class LLaMAModelQuantizeParamsWrapper {
+public:
+  llama_model_quantize_params params;
+
+  LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
+
+  ~LLaMAModelQuantizeParamsWrapper(){};
+};
+
+class RbLLaMAModelQuantizeParams {
+public:
+  static VALUE llama_model_quantize_params_alloc(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
+    new (ptr) LLaMAModelQuantizeParamsWrapper();
+    return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
+  };
+
+  static void llama_model_quantize_params_free(void* ptr) {
+    ((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
+    ruby_xfree(ptr);
+  };
+
+  static size_t llama_model_quantize_params_size(const void* ptr) {
+    return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
+  };
+
+  static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr;
+    TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
+    return ptr;
+  };
+
+  static void define_class(VALUE outer) {
+    rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
+    rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+  };
+
+private:
+  static const rb_data_type_t llama_model_quantize_params_type;
+
+  // n_thread
+  static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.nthread = NUM2INT(n_thread);
+    return INT2NUM(ptr->params.nthread);
+  };
+
+  static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return INT2NUM(ptr->params.nthread);
+  };
+
+  // ftype
+  static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
+    return INT2NUM(ptr->params.ftype);
+  };
+
+  static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return INT2NUM(ptr->params.ftype);
+  };
+
+  // allow_requantize
+  static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
+      ptr->params.allow_requantize = false;
+    } else {
+      ptr->params.allow_requantize = true;
+    }
+    return ptr->params.allow_requantize ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.allow_requantize ? Qtrue : Qfalse;
+  };
+
+  // quantize_output_tensor
+  static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
+      ptr->params.quantize_output_tensor = false;
+    } else {
+      ptr->params.quantize_output_tensor = true;
+    }
+    return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
+  };
+};
+
+const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
+  "RbLLaMAModelQuantizeParams",
+  { NULL,
+    RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,
+    RbLLaMAModelQuantizeParams::llama_model_quantize_params_size },
+  NULL,
+  NULL,
+  RUBY_TYPED_FREE_IMMEDIATELY
+};
+
 class LLaMAContextWrapper {
 public:
   struct llama_context* ctx;
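
This block is the `LLaMACpp::ModelQuantizeParams` class behind the new `model_quantize` signature noted as a breaking change in the CHANGELOG. A hedged sketch combining the writers defined above with one of the k-quant `LLAMA_FTYPE_*` constants registered near the end of this file; the file names are placeholders and the comments paraphrase the llama.cpp field semantics:

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # target quantization type
params.n_thread = 4                                 # maps to llama_model_quantize_params.nthread
params.allow_requantize = false                     # do not re-quantize already quantized tensors
params.quantize_output_tensor = true                # quantize the output tensor as well

LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
```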
@@ -465,6 +637,7 @@ public:
     rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
     rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
     rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
+    rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -517,7 +690,7 @@ private:
       return Qnil;
     }
     if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
-      rb_raise(rb_eArgError, "params must be a …
+      rb_raise(rb_eArgError, "params must be a ContextParams");
       return Qnil;
     }
 
@@ -599,6 +772,24 @@ private:
     return Qnil;
   };
 
+  static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    if (!RB_TYPE_P(fname_, T_STRING)) {
+      rb_raise(rb_eArgError, "fname must be a string");
+      return Qnil;
+    }
+    const char* fname = StringValueCStr(fname_);
+    if (llama_eval_export(ptr->ctx, fname) != 0) {
+      return Qfalse;
+    }
+    RB_GC_GUARD(fname_);
+    return Qtrue;
+  };
+
   static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
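
`eval_export` wraps `llama_eval_export`, which llama.cpp uses to write a static computation graph for a single-token evaluation to a file; the binding raises `ArgumentError` for a non-String argument and returns `true` or `false` depending on the result. A minimal sketch with an arbitrary output file name:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

# Writes the exported compute graph to the given path.
warn 'eval_export failed' unless context.eval_export('llama.ggml')
```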
@@ -1428,10 +1619,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
 
 static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   VALUE kw_args = Qnil;
-  ID kw_table[…
-  VALUE kw_values[…
+  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
+  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
   rb_scan_args(argc, argv, ":", &kw_args);
-  rb_get_kwargs(kw_args, kw_table, 3, …
+  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
 
   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
     rb_raise(rb_eArgError, "input_path must be a string");
@@ -1441,21 +1632,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
     rb_raise(rb_eArgError, "output_path must be a string");
     return Qnil;
   }
-  if (!…
-    rb_raise(rb_eArgError, "…
-    return Qnil;
-  }
-  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
-    rb_raise(rb_eArgError, "n_threads must be an integer");
+  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
+    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
     return Qnil;
   }
 
   const char* input_path = StringValueCStr(kw_values[0]);
   const char* output_path = StringValueCStr(kw_values[1]);
-  …
-  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
 
-  if (llama_model_quantize(input_path, output_path, (…
+  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
     rb_raise(rb_eRuntimeError, "Failed to quantize model");
     return Qnil;
   }
@@ -1505,6 +1691,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -1513,6 +1701,15 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
 
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;