llama_cpp 0.0.5 → 0.0.6
This diff compares the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/extconf.rb +15 -1
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +1343 -800
- data/ext/llama_cpp/src/ggml.h +12 -2
- data/ext/llama_cpp/src/llama.cpp +60 -16
- data/ext/llama_cpp/src/llama.h +5 -1
- data/ext/llama_cpp/src/llama_util.h +0 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -1
- metadata +3 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 00d1390f6d91449d27050952aa9dd80572fb18d6809dc64098e2e4fce79bc91e
+  data.tar.gz: 49f4422e5c8903bb83fbd69a7901ff8ed0cbfae248d6652bb93b43feac331821
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0633f8565d940b618eb1637fa095a7bc940618100698986971274ffb943fca9fab32eb04e5a0e22bd930f18f998b7f4ca59fb0e4f7ac6210efdf7d4c44e8cc9c
+  data.tar.gz: 2e79f037ec38c415cbe9a64485b986c64e3de73c26580b0e6e8be577cec38abcc4eb9506de2bd1d0b84646f4f44b7422aafe83c0ee867028c2a223f345a294ce
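The updated checksums can be reproduced locally. A minimal sketch, assuming the gem archive has been unpacked so that metadata.gz and data.tar.gz sit in the current directory (that location is an assumption, not part of this diff):

```ruby
require 'digest'

# Recompute the SHA256 digests of the unpacked gem artifacts and compare
# them against the values recorded in checksums.yaml above.
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file}: #{Digest::SHA256.file(file).hexdigest}"
end
```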
data/CHANGELOG.md CHANGED

@@ -1,5 +1,15 @@
 ## [Unreleased]
 
+## [[0.0.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.5...v0.0.6)] - 2023-04-22
+
+- Bump bundled llama.cpp from master-315a95a to master-12b5900.
+- Add model file type constants
+- Add `model_quantize` module function to LLaMACpp.
+- Add cublas config option:
+  ```
+  $ gem install llama_cpp -- --with-cublas
+  ```
+
 ## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
 
 - Bump bundled llama.cpp from master-c85e03d to master-315a95a.
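For context, the new file type constants and `model_quantize` land directly on the LLaMACpp module, as the llama_cpp.cpp diff below shows. A quick sketch that lists the constants (the name filter is illustrative, not part of the gem):

```ruby
require 'llama_cpp'

# Print each model file type constant added in 0.0.6 with its integer value.
LLaMACpp.constants
        .select { |c| c.to_s.start_with?('LLAMA_FTYPE_') }
        .each { |c| puts "#{c} = #{LLaMACpp.const_get(c)}" }
```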
data/ext/llama_cpp/extconf.rb CHANGED

@@ -23,8 +23,15 @@ if with_config('openblas')
 end
 
 if with_config('accelerate')
+  abort 'Accelerate framework is not found.' unless have_framework('Accelerate')
+
   $CFLAGS << ' -DGGML_USE_ACCELERATE'
-
+end
+
+if with_config('cublas')
+  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
+  $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
 end

@@ -49,3 +56,10 @@ end
 # rubocop:enable Layout/LineLength
 
 create_makefile('llama_cpp/llama_cpp')
+
+if with_config('cublas')
+  File.open('Makefile', 'a') do |f|
+    f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
+    f.puts "\tnvcc -arch=native -c -o $@ $<"
+  end
+end
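mkmf only generates compile rules for C and C++ sources, so extconf.rb reopens the Makefile that create_makefile just wrote and appends a hand-written nvcc rule for the .cu file. The same pattern in isolation, with comments (a sketch of the mechanism above, not additional gem code):

```ruby
require 'mkmf'

create_makefile('llama_cpp/llama_cpp') # writes the Makefile first

# with_config('cublas') is true when the gem was installed with
# `gem install llama_cpp -- --with-cublas`.
if with_config('cublas')
  File.open('Makefile', 'a') do |f|
    # mkmf knows nothing about .cu sources, so add the rule by hand.
    f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
    # $@ expands to the target (ggml-cuda.o), $< to the first prerequisite.
    f.puts "\tnvcc -arch=native -c -o $@ $<"
  end
end
```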
data/ext/llama_cpp/llama_cpp.cpp CHANGED

@@ -612,6 +612,43 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
+static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
+  VALUE kw_args = Qnil;
+  ID kw_table[4] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
+  VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+  rb_scan_args(argc, argv, ":", &kw_args);
+  rb_get_kwargs(kw_args, kw_table, 3, 1, kw_values);
+
+  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+    rb_raise(rb_eArgError, "input_path must be a string");
+    return Qnil;
+  }
+  if (!RB_TYPE_P(kw_values[1], T_STRING)) {
+    rb_raise(rb_eArgError, "output_path must be a string");
+    return Qnil;
+  }
+  if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+    rb_raise(rb_eArgError, "ftype must be an integer");
+    return Qnil;
+  }
+  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+    rb_raise(rb_eArgError, "n_threads must be an integer");
+    return Qnil;
+  }
+
+  const char* input_path = StringValueCStr(kw_values[0]);
+  const char* output_path = StringValueCStr(kw_values[1]);
+  const int ftype = NUM2INT(kw_values[2]);
+  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+  if (llama_model_quantize(input_path, output_path, (llama_ftype)ftype, n_threads) != 0) {
+    rb_raise(rb_eRuntimeError, "Failed to quantize model");
+    return Qnil;
+  }
+
+  return Qnil;
+}
+
 static VALUE rb_llama_token_bos(VALUE self) {
   return INT2NUM(llama_token_bos());
 }

@@ -638,12 +675,21 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMAContext::define_class(rb_mLLaMACpp);
   RbLLaMAContextParams::define_class(rb_mLLaMACpp);
 
+  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
   rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
+
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_3", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
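A usage sketch for the new binding: the keyword names, the three-required/one-optional split, and the n_threads default of 1 come from the C code above, while the model paths are hypothetical:

```ruby
require 'llama_cpp'

# Quantize an f16 model file down to 4-bit (Q4_0) weights.
LLaMACpp.model_quantize(
  input_path: 'models/7B/ggml-model-f16.bin',   # hypothetical path
  output_path: 'models/7B/ggml-model-q4_0.bin', # hypothetical path
  ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0,
  n_threads: 4                                  # optional; defaults to 1
)
```

A non-zero return from the underlying llama_model_quantize is surfaced as a RuntimeError, so no status code needs checking on the Ruby side.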
data/ext/llama_cpp/src/ggml-cuda.h ADDED

@@ -0,0 +1,12 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif