llama_cpp 0.3.3 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +146 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +485 -67
- data/ext/llama_cpp/src/ggml-metal.m +52 -43
- data/ext/llama_cpp/src/ggml-metal.metal +587 -470
- data/ext/llama_cpp/src/ggml.c +105 -79
- data/ext/llama_cpp/src/ggml.h +13 -1
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +123 -66
- data/ext/llama_cpp/src/llama.h +34 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +12 -1
- metadata +2 -2
checksums.yaml CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 35afb5cc65c290036ae7e45459eadc9b509f34f33a3f7708244cf47f1a38829f
+  data.tar.gz: 3301158526c63d9d2004e22bda0d1cc8025b4343d8d737df96260786531b074d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b0a50f9f012f44f119a70790d3de07c7fcc64151246791e270e4ff9fc479a85a01c53cf2775945eba3145a3ba89da55a8d14891c6236cfeae16aed5ae455cf0d
+  data.tar.gz: ede388584e115ae93d509b6c15b288303c348f3cfe8ea46879a1b69e6c96be31a321edbb52cfbeb309a8fb456738f3f6b7cc1d3f71ce7addbd05b3a1e73d4755
```
data/CHANGELOG.md CHANGED

```diff
@@ -1,3 +1,27 @@
+## [[0.3.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.3...v0.3.4)] - 2023-07-23
+
+- Bump bundled llama.cpp from master-32c5411 to master-d924522.
+- Add `rope_freq_base` and `rope_freq_scale` options to ContextParams.
+- Add `max_devices` module function to LLaMACpp.
+- Add `n_vocab`, `n_ctx`, and `n_embd` methods to Model.
+- Add `vocab`, `tokenize`, and `token_to_str` methods to Model.
+  ```ruby
+  require 'llama_cpp'
+
+  params = LLaMACpp::ContextParams.new
+  model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)
+
+  p model.tokenize(text: 'hello, world')
+  # => [12199, 29892, 3186]
+
+  p model.token_to_str(12199)
+  # => "hello"
+  ```
+
+**Breaking Changes**
+- Fix to automatically call the `backend_free` method when the Ruby script exits.
+- Remove the `smooth_factor` argument from the `sample_classifier_free_guidance` method on Context.
+
 ## [[0.3.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.2...v0.3.3)] - 2023-07-15
 
 - Bump bundled llama.cpp from master-481f793 to master-32c5411.
```
data/ext/llama_cpp/llama_cpp.cpp CHANGED

```diff
@@ -406,6 +406,10 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
     rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
     rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
+    rb_define_method(rb_cLLaMAContextParams, "rope_freq_base=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_base), 1);
+    rb_define_method(rb_cLLaMAContextParams, "rope_freq_base", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_base), 0);
+    rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_scale), 1);
+    rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_scale), 0);
     rb_define_method(rb_cLLaMAContextParams, "low_vram=", RUBY_METHOD_FUNC(_llama_context_params_set_low_vram), 1);
     rb_define_method(rb_cLLaMAContextParams, "low_vram", RUBY_METHOD_FUNC(_llama_context_params_get_low_vram), 0);
     rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
@@ -494,6 +498,30 @@ private:
     return ret;
   }
 
+  // rope_freq_base
+  static VALUE _llama_context_params_set_rope_freq_base(VALUE self, VALUE rope_freq_base) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.rope_freq_base = NUM2DBL(rope_freq_base);
+    return DBL2NUM(ptr->params.rope_freq_base);
+  }
+
+  static VALUE _llama_context_params_get_rope_freq_base(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return DBL2NUM(ptr->params.rope_freq_base);
+  }
+
+  // rope_freq_scale
+  static VALUE _llama_context_params_set_rope_freq_scale(VALUE self, VALUE rope_freq_scale) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.rope_freq_scale = NUM2DBL(rope_freq_scale);
+    return DBL2NUM(ptr->params.rope_freq_scale);
+  }
+
+  static VALUE _llama_context_params_get_rope_freq_scale(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return DBL2NUM(ptr->params.rope_freq_scale);
+  }
+
   // low_vram
   static VALUE _llama_context_params_set_low_vram(VALUE self, VALUE low_vram) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
```
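The new accessors simply read and write the `rope_freq_base` and `rope_freq_scale` fields of the wrapped `llama_context_params` struct. A minimal Ruby sketch of how they are used (the frequency values here are illustrative, not gem defaults):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
# Adjust RoPE frequency scaling, e.g. for extended-context models.
params.rope_freq_base  = 10000.0  # illustrative value
params.rope_freq_scale = 0.5      # illustrative value

p params.rope_freq_base  # => 10000.0
p params.rope_freq_scale # => 0.5
```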
```diff
@@ -764,6 +792,12 @@ public:
     rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
     rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
     rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
+    rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_n_vocab_from_model), 0);
+    rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_n_ctx_from_model), 0);
+    rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_n_embd_from_model), 0);
+    rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
+    rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
+    rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
   }
 
 private:
@@ -908,6 +942,109 @@ private:
     }
     return Qnil;
   }
+
+  static VALUE _llama_model_get_n_vocab_from_model(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_vocab_from_model(ptr->model));
+  }
+
+  static VALUE _llama_model_get_n_ctx_from_model(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_ctx_from_model(ptr->model));
+  }
+
+  static VALUE _llama_model_get_n_embd_from_model(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_embd_from_model(ptr->model));
+  }
+
+  static VALUE _llama_model_get_vocab_from_model(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[1] = { rb_intern("capacity") };
+    VALUE kw_values[1] = { Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
+
+    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "capacity must be an integer");
+      return Qnil;
+    }
+
+    const int capacity = NUM2INT(kw_values[0]);
+
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    const int n = std::min(capacity, llama_n_vocab_from_model(ptr->model));
+    const char** vocabs = ALLOCA_N(const char*, n);
+    float* scores = ALLOCA_N(float, n);
+
+    llama_get_vocab_from_model(ptr->model, vocabs, scores, capacity);
+
+    VALUE vocabs_ary = rb_ary_new();
+    VALUE scores_ary = rb_ary_new();
+
+    for (int i = 0; i < n; i++) {
+      rb_ary_push(vocabs_ary, rb_str_new_cstr(vocabs[i]));
+      rb_ary_push(scores_ary, DBL2NUM(scores[i]));
+    }
+
+    VALUE ret = rb_ary_new3(2, vocabs_ary, scores_ary);
+
+    return ret;
+  }
+
+  static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
+    if (!RB_INTEGER_TYPE_P(token_)) {
+      rb_raise(rb_eArgError, "token must be an integer");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    const char* str = llama_token_to_str_with_model(ptr->model, token);
+    return rb_str_new_cstr(str);
+  }
+
+  static VALUE _llama_model_tokenize_with_model(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "text must be a String");
+      return Qnil;
+    }
+    if (kw_values[1] != Qundef && !RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_max_tokens must be an integer");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && (kw_values[2] != Qtrue && kw_values[2] != Qfalse)) {
+      rb_raise(rb_eArgError, "add_bos must be a boolean");
+      return Qnil;
+    }
+
+    VALUE text_ = kw_values[0];
+    std::string text = StringValueCStr(text_);
+    const bool add_bos = kw_values[2] == Qtrue ? true : false;
+    const int n_max_tokens = kw_values[1] != Qundef ? NUM2INT(kw_values[1]) : text.size() + (add_bos ? 1 : 0);
+
+    llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+
+    if (n_tokens < 0) {
+      rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
+      return Qnil;
+    }
+
+    VALUE ret = rb_ary_new2(n_tokens);
+    for (int i = 0; i < n_tokens; i++) {
+      rb_ary_store(ret, i, INT2NUM(tokens[i]));
+    }
+
+    RB_GC_GUARD(text_);
+    return ret;
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
```
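These bindings expose model-level metadata without requiring a full `Context`. As the C++ above shows, `vocab` returns a two-element array of token strings and scores, and `tokenize` raises a `RuntimeError` when the token count exceeds `n_max_tokens`. A short sketch, assuming a model file at a hypothetical path:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)

p model.n_vocab # vocabulary size
p model.n_ctx   # context length the model was trained with
p model.n_embd  # embedding dimension

# vocab returns [token_strings, scores]; capacity caps how many entries come back.
strings, scores = model.vocab(capacity: 10)
p strings.zip(scores).first(3)
```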
```diff
@@ -1581,11 +1718,11 @@ private:
 
   static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
-    ID kw_table[3] = { rb_intern("guidance"), rb_intern("scale"), rb_intern("smooth_factor") };
-    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
+    VALUE kw_values[2] = { Qundef, Qundef };
     VALUE candidates = Qnil;
     rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
+    rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
 
     if (!rb_obj_is_kind_of(kw_values[0], rb_cLLaMAContext)) {
       rb_raise(rb_eArgError, "guidance must be a Context");
@@ -1595,10 +1732,6 @@ private:
       rb_raise(rb_eArgError, "scale must be a float");
       return Qnil;
     }
-    if (!RB_FLOAT_TYPE_P(kw_values[2])) {
-      rb_raise(rb_eArgError, "smooth_factor must be a float");
-      return Qnil;
-    }
 
     LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
     if (ctx_ptr->ctx == NULL) {
@@ -1617,9 +1750,8 @@ private:
       return Qnil;
     }
     const float scale = NUM2DBL(kw_values[1]);
-    const float smooth_factor = NUM2DBL(kw_values[2]);
 
-    llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale, smooth_factor);
+    llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale);
 
     return Qnil;
   }
```
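This is the breaking change noted in the CHANGELOG: `smooth_factor` is gone, so the method now takes only the `guidance:` and `scale:` keywords. A hedged sketch of the updated call site, assuming `context` and `guidance_context` are already-evaluated `LLaMACpp::Context` instances and `candidates` is a token-candidates array built from the current logits (setup abbreviated):

```ruby
# 0.3.4: smooth_factor is no longer accepted.
context.sample_classifier_free_guidance(candidates, guidance: guidance_context, scale: 1.5)

# 0.3.3 and earlier also required smooth_factor:, which now raises an ArgumentError:
# context.sample_classifier_free_guidance(candidates, guidance: guidance_context,
#                                         scale: 1.5, smooth_factor: 0.1)
```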
```diff
@@ -2062,6 +2194,10 @@ static VALUE rb_llama_mlock_supported(VALUE self) {
   return llama_mlock_supported() ? Qtrue : Qfalse;
 }
 
+static VALUE rb_llama_max_devices(VALUE self) {
+  return INT2NUM(llama_max_devices());
+}
+
 extern "C" void Init_llama_cpp(void) {
   rb_mLLaMACpp = rb_define_module("LLaMACpp");
 
@@ -2082,6 +2218,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
+  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
 
```
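`max_devices` surfaces the runtime value of `llama_max_devices()` alongside the compile-time `LLAMA_MAX_DEVICES` constant that the module already exported:

```ruby
require 'llama_cpp'

p LLaMACpp.max_devices        # runtime value from llama_max_devices()
p LLaMACpp::LLAMA_MAX_DEVICES # compile-time constant
```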