llama_cpp 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +106 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +2038 -895
- data/ext/llama_cpp/src/ggml.h +21 -1
- data/ext/llama_cpp/src/llama.cpp +376 -62
- data/ext/llama_cpp/src/llama.h +17 -1
- data/ext/llama_cpp/src/llama_util.h +22 -16
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +13 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 00d1390f6d91449d27050952aa9dd80572fb18d6809dc64098e2e4fce79bc91e
+  data.tar.gz: 49f4422e5c8903bb83fbd69a7901ff8ed0cbfae248d6652bb93b43feac331821
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0633f8565d940b618eb1637fa095a7bc940618100698986971274ffb943fca9fab32eb04e5a0e22bd930f18f998b7f4ca59fb0e4f7ac6210efdf7d4c44e8cc9c
+  data.tar.gz: 2e79f037ec38c415cbe9a64485b986c64e3de73c26580b0e6e8be577cec38abcc4eb9506de2bd1d0b84646f4f44b7422aafe83c0ee867028c2a223f345a294ce
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,33 @@
 ## [Unreleased]
 
+## [[0.0.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.5...v0.0.6)] - 2023-04-22
+
+- Bump bundled llama.cpp from master-315a95a to master-12b5900.
+- Add model file type constants.
+- Add `model_quantize` module function to LLaMACpp.
+- Add cublas config option:
+```
+$ gem install llama_cpp -- --with-cublas
+```
+
+## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
+
+- Bump bundled llama.cpp from master-c85e03d to master-315a95a.
+- Add `apply_lora_from_file` method to LLaMACpp::Context.
+- Add `mlock_supported?` module function to LLaMACpp.
+- Add `mmap_supported?` module function to LLaMACpp.
+- Fix to not destroy original prompt in `LLaMACpp.generate` module function.
+- Add check for context initialization.
+- Add blas config options:
+```
+$ gem install llama_cpp -- --with-openblas
+```
+macOS:
+```
+$ gem install llama_cpp -- --with-openblas --with-opt-dir=/opt/homebrew/opt/openblas
+$ gem install llama_cpp -- --with-accelerate
+```
+
 ## [[0.0.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.3...v0.0.4)] - 2023-04-15
 
 - Bump bundled llama.cpp from master-698f7b5 to master-c85e03d.
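The 0.0.5 and 0.0.6 entries above add several Ruby-level APIs. A minimal sketch of how they could be exercised (the file paths are hypothetical placeholders; the keyword arguments and constant names follow the bindings shown further down in this diff):

```ruby
require 'llama_cpp'

# Capability checks added in 0.0.5.
puts "mmap supported?:  #{LLaMACpp.mmap_supported?}"
puts "mlock supported?: #{LLaMACpp.mlock_supported?}"

# Quantization added in 0.0.6: convert an f16 GGML model to 4-bit
# (input/output paths below are placeholders).
LLaMACpp.model_quantize(
  input_path: '/path/to/ggml-model-f16.bin',
  output_path: '/path/to/ggml-model-q4_0.bin',
  ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0
)
```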
data/README.md
CHANGED
@@ -20,7 +20,8 @@ If bundler is not being used to manage dependencies, install the gem by executing:
 
 ## Usage
 
-Prepare
+Prepare the quantized model by refering to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage) or
+download the qunatized model, for example [ggml-vicuna-7b-4bit](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5541351), from Hugging Face.
 
 ```ruby
 require 'llama_cpp'
@@ -28,7 +29,7 @@ require 'llama_cpp'
 params = LLaMACpp::ContextParams.new
 params.seed = 12
 
-context = LLaMACpp::Context.new(model_path: '/path/to/
+context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
 
 puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)
 # => "There are two major cities in Japan, Tokyo and Osaka, which have about 30 million populations."
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -15,6 +15,25 @@ if RUBY_PLATFORM.match?(/darwin|linux|bsd/) && try_compile('#include <stdio.h>',
   $CXXFLAGS << ' -pthread'
 end
 
+if with_config('openblas')
+  abort 'libopenblas is not found.' unless have_library('openblas')
+  abort 'cblas.h is not found.' unless have_header('cblas.h')
+
+  $CFLAGS << ' -DGGML_USE_OPENBLAS'
+end
+
+if with_config('accelerate')
+  abort 'Accelerate framework is not found.' unless have_framework('Accelerate')
+
+  $CFLAGS << ' -DGGML_USE_ACCELERATE'
+end
+
+if with_config('cublas')
+  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
+  $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
+end
+
 UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
 
 # rubocop:disable Layout/LineLength
@@ -37,3 +56,10 @@ end
 # rubocop:enable Layout/LineLength
 
 create_makefile('llama_cpp/llama_cpp')
+
+if with_config('cublas')
+  File.open('Makefile', 'a') do |f|
+    f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
+    f.puts "\tnvcc -arch=native -c -o $@ $<"
+  end
+end
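The extconf.rb changes above wire the `--with-openblas`, `--with-accelerate`, and `--with-cublas` install flags to the corresponding GGML compile definitions. One way to confirm which backend a build actually picked up is to inspect the system-info string exposed by the bindings; a sketch (the exact contents of the string come from llama.cpp and vary by build):

```ruby
require 'llama_cpp'

# After e.g. `gem install llama_cpp -- --with-openblas` (or --with-cublas / --with-accelerate),
# the system-info string reported by llama.cpp should include an entry such as "BLAS = 1".
puts LLaMACpp.print_system_info
```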
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -228,6 +228,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
     rb_define_method(rb_cLLaMAContext, "free", RUBY_METHOD_FUNC(_llama_context_free), 0);
     rb_define_method(rb_cLLaMAContext, "load", RUBY_METHOD_FUNC(_llama_context_load), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
   };
 
 private:
@@ -311,6 +312,10 @@
     const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
 
     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
       rb_raise(rb_eRuntimeError, "Failed to evaluate");
       return Qnil;
@@ -349,6 +354,10 @@
 
     std::vector<llama_token> tokens(n_max_tokens);
     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
     if (n < 0) {
       rb_raise(rb_eRuntimeError, "Failed to tokenize");
@@ -449,6 +458,10 @@
     }
 
     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     llama_token token = llama_sample_top_p_top_k(ptr->ctx, last_n_tokens_data.data(), last_n_tokens_size, top_k, top_p, temp, penalty);
 
     return INT2NUM(token);
@@ -548,6 +561,43 @@
     RB_GC_GUARD(filename);
     return Qnil;
   };
+
+  static VALUE _llama_context_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "lora_path must be a string");
+      return Qnil;
+    }
+    if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
+      rb_raise(rb_eArgError, "base_model_path must be a string");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+
+    const char* lora_path = StringValueCStr(kw_values[0]);
+    const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
+    const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx != NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is already loaded");
+      return Qnil;
+    }
+
+    if (llama_apply_lora_from_file(ptr->ctx, lora_path, base_model_path, n_threads) != 0) {
+      rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
+      return Qnil;
+    }
+    return Qnil;
+  };
 };
 
 const rb_data_type_t RbLLaMAContext::llama_context_type = {
@@ -562,6 +612,43 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
+static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
+  VALUE kw_args = Qnil;
+  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
+  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+  rb_scan_args(argc, argv, ":", &kw_args);
+  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
+
+  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+    rb_raise(rb_eArgError, "input_path must be a string");
+    return Qnil;
+  }
+  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
+    rb_raise(rb_eArgError, "output_path must be a string");
+    return Qnil;
+  }
+  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+    rb_raise(rb_eArgError, "ftype must be an integer");
+    return Qnil;
+  }
+  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+    rb_raise(rb_eArgError, "n_threads must be an integer");
+    return Qnil;
+  }
+
+  const char* input_path = StringValueCStr(kw_values[0]);
+  const char* output_path = StringValueCStr(kw_values[1]);
+  const int ftype = NUM2INT(kw_values[2]);
+  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
+    rb_raise(rb_eRuntimeError, "Failed to quantize model");
+    return Qnil;
+  }
+
+  return Qnil;
+}
+
 static VALUE rb_llama_token_bos(VALUE self) {
   return INT2NUM(llama_token_bos());
 }
@@ -575,14 +662,33 @@ static VALUE rb_llama_print_system_info(VALUE self) {
   return rb_utf8_str_new_cstr(result);
 }
 
+static VALUE rb_llama_mmap_supported(VALUE self) {
+  return llama_mmap_supported() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_mlock_supported(VALUE self) {
+  return llama_mlock_supported() ? Qtrue : Qfalse;
+}
+
 extern "C" void Init_llama_cpp(void) {
   rb_mLLaMACpp = rb_define_module("LLaMACpp");
   RbLLaMAContext::define_class(rb_mLLaMACpp);
   RbLLaMAContextParams::define_class(rb_mLLaMACpp);
 
+  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
   rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
+  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
+
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_3", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   std::stringstream ss_magic;
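The NULL-context guards added above mean that calling one of the guarded methods (`tokenize`, `eval`, or `sample_top_p_top_k`) on a context that has not loaded a model now raises a RuntimeError instead of dereferencing a NULL pointer. A minimal sketch, assuming a context can be constructed without `model_path` and that `#tokenize` accepts a `text:` keyword (both inferred from the wrapper code, not from the gem's documentation):

```ruby
require 'llama_cpp'

# Hypothetical: a context constructed without a model has no underlying llama_context,
# so the guarded methods raise instead of crashing the process.
context = LLaMACpp::Context.new
begin
  context.tokenize(text: 'Hello, world.')
rescue RuntimeError => e
  puts e.message # => "LLaMA context is not initialized"
end
```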
data/ext/llama_cpp/src/ggml-cuda.h
ADDED
@@ -0,0 +1,12 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif