llama_cpp 0.0.5 → 0.0.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/extconf.rb +15 -1
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +1343 -800
- data/ext/llama_cpp/src/ggml.h +12 -2
- data/ext/llama_cpp/src/llama.cpp +60 -16
- data/ext/llama_cpp/src/llama.h +5 -1
- data/ext/llama_cpp/src/llama_util.h +0 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 00d1390f6d91449d27050952aa9dd80572fb18d6809dc64098e2e4fce79bc91e
+  data.tar.gz: 49f4422e5c8903bb83fbd69a7901ff8ed0cbfae248d6652bb93b43feac331821
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0633f8565d940b618eb1637fa095a7bc940618100698986971274ffb943fca9fab32eb04e5a0e22bd930f18f998b7f4ca59fb0e4f7ac6210efdf7d4c44e8cc9c
+  data.tar.gz: 2e79f037ec38c415cbe9a64485b986c64e3de73c26580b0e6e8be577cec38abcc4eb9506de2bd1d0b84646f4f44b7422aafe83c0ee867028c2a223f345a294ce
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
 ## [Unreleased]
 
+## [[0.0.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.5...v0.0.6)] - 2023-04-22
+
+- Bump bundled llama.cpp from master-315a95a to master-12b5900.
+- Add model file type constants
+- Add `model_quantize` module function to LLaMACpp.
+- Add cublas config option:
+  ```
+  $ gem install llama_cpp -- --with-cublas
+  ```
+
 ## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
 
 - Bump bundled llama.cpp from master-c85e03d to master-315a95a.
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -23,8 +23,15 @@ if with_config('openblas')
 end
 
 if with_config('accelerate')
+  abort 'Accelerate framework is not found.' unless have_framework('Accelerate')
+
   $CFLAGS << ' -DGGML_USE_ACCELERATE'
-
+end
+
+if with_config('cublas')
+  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
+  $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
 end
 
 UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -49,3 +56,10 @@ end
 # rubocop:enable Layout/LineLength
 
 create_makefile('llama_cpp/llama_cpp')
+
+if with_config('cublas')
+  File.open('Makefile', 'a') do |f|
+    f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
+    f.puts "\tnvcc -arch=native -c -o $@ $<"
+  end
+end
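Taken together, the two hunks wire CUDA support into the mkmf build: RubyGems forwards everything after `--` on the install command line to extconf.rb, so `with_config('cublas')` returns true, the cuBLAS compile/link flags are set, and an nvcc rule is appended to the generated Makefile. A sketch of the resulting flow (the command and rule below are the ones shown in this diff; the CUDA install location `/usr/local/cuda` is hard-coded in the hunk above):

```
$ gem install llama_cpp -- --with-cublas

# rule appended to the generated Makefile by the second hunk,
# so nvcc compiles the CUDA kernels while cc builds the rest:
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
	nvcc -arch=native -c -o $@ $<
```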
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -612,6 +612,43 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
+static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
+  VALUE kw_args = Qnil;
+  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
+  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+  rb_scan_args(argc, argv, ":", &kw_args);
+  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
+
+  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+    rb_raise(rb_eArgError, "input_path must be a string");
+    return Qnil;
+  }
+  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
+    rb_raise(rb_eArgError, "output_path must be a string");
+    return Qnil;
+  }
+  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+    rb_raise(rb_eArgError, "ftype must be an integer");
+    return Qnil;
+  }
+  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+    rb_raise(rb_eArgError, "n_threads must be an integer");
+    return Qnil;
+  }
+
+  const char* input_path = StringValueCStr(kw_values[0]);
+  const char* output_path = StringValueCStr(kw_values[1]);
+  const int ftype = NUM2INT(kw_values[2]);
+  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
+    rb_raise(rb_eRuntimeError, "Failed to quantize model");
+    return Qnil;
+  }
+
+  return Qnil;
+}
+
 static VALUE rb_llama_token_bos(VALUE self) {
   return INT2NUM(llama_token_bos());
 }
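A hedged usage sketch of the new binding from Ruby (the model paths are hypothetical; `ftype` takes one of the `LLAMA_FTYPE_*` constants registered in the next hunk, and `n_threads` defaults to 1 on the C side when omitted):

```ruby
require 'llama_cpp'

# Quantize an f16 GGML model file down to Q4_0.
LLaMACpp.model_quantize(
  input_path: 'models/7B/ggml-model-f16.bin',
  output_path: 'models/7B/ggml-model-q4_0.bin',
  ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0,
  n_threads: 4
)
```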
@@ -638,12 +675,21 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMAContext::define_class(rb_mLLaMACpp);
   RbLLaMAContextParams::define_class(rb_mLLaMACpp);
 
+  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
   rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_3", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
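The seven new constants wrap the `llama_ftype` enum values from the bundled `llama.h`. A minimal sketch for inspecting them from Ruby:

```ruby
require 'llama_cpp'

# Print each newly exposed model file type constant with its integer value.
LLaMACpp.constants.map(&:to_s).grep(/\ALLAMA_FTYPE_/).sort.each do |name|
  puts "#{name} = #{LLaMACpp.const_get(name)}"
end
```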
data/ext/llama_cpp/src/ggml-cuda.h
CHANGED
@@ -0,0 +1,12 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif