llama_cpp 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +146 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +485 -67
- data/ext/llama_cpp/src/ggml-metal.m +52 -43
- data/ext/llama_cpp/src/ggml-metal.metal +587 -470
- data/ext/llama_cpp/src/ggml.c +105 -79
- data/ext/llama_cpp/src/ggml.h +13 -1
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +123 -66
- data/ext/llama_cpp/src/llama.h +34 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +12 -1
- metadata +2 -2
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 35afb5cc65c290036ae7e45459eadc9b509f34f33a3f7708244cf47f1a38829f
+  data.tar.gz: 3301158526c63d9d2004e22bda0d1cc8025b4343d8d737df96260786531b074d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b0a50f9f012f44f119a70790d3de07c7fcc64151246791e270e4ff9fc479a85a01c53cf2775945eba3145a3ba89da55a8d14891c6236cfeae16aed5ae455cf0d
+  data.tar.gz: ede388584e115ae93d509b6c15b288303c348f3cfe8ea46879a1b69e6c96be31a321edbb52cfbeb309a8fb456738f3f6b7cc1d3f71ce7addbd05b3a1e73d4755
```
data/CHANGELOG.md
CHANGED
````diff
@@ -1,3 +1,27 @@
+## [[0.3.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.3...v0.3.4)] - 2023-07-23
+
+- Bump bundled llama.cpp from master-32c5411 to master-d924522.
+- Add `rope_freq_base` and `rope_freq_scale` options to ContextParams.
+- Add `max_devices` module function to LLaMACpp.
+- Add `n_vocab`, `n_ctx`, and `n_embd` methods to Model.
+- Add `vocab`, `tokenize`, and `token_to_str` methods to Model.
+```ruby
+require 'llama_cpp'
+
+params = LLaMACpp::ContextParams.new
+model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)
+
+p model.tokenize(text: 'hello, world')
+# => [12199, 29892, 3186]
+
+p model.token_to_str(12199)
+# => "hello"
+```
+
+**Breaking Changes**
+
+- Fix to automatically call `backend_free` method when Ruby script exits.
+- Remove `smooth_factor` argument from `sample_classifier_free_guidance` method on Context.
+
 ## [[0.3.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.2...v0.3.3)] - 2023-07-15
 
 - Bump bundled llama.cpp from master-481f793 to master-32c5411.
````
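The `rope_freq_base` and `rope_freq_scale` entries above are plain float accessors on `ContextParams` (see the bindings added further down in this diff). A minimal sketch of how they might be set; the values are illustrative, not library defaults:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
# RoPE frequency settings; illustrative values, not the gem's defaults.
params.rope_freq_base  = 10000.0
params.rope_freq_scale = 0.5

p params.rope_freq_base   # => 10000.0
p params.rope_freq_scale  # => 0.5
```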
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
```diff
@@ -406,6 +406,10 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
     rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
     rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
+    rb_define_method(rb_cLLaMAContextParams, "rope_freq_base=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_base), 1);
+    rb_define_method(rb_cLLaMAContextParams, "rope_freq_base", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_base), 0);
+    rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_scale), 1);
+    rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_scale), 0);
     rb_define_method(rb_cLLaMAContextParams, "low_vram=", RUBY_METHOD_FUNC(_llama_context_params_set_low_vram), 1);
     rb_define_method(rb_cLLaMAContextParams, "low_vram", RUBY_METHOD_FUNC(_llama_context_params_get_low_vram), 0);
     rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
@@ -494,6 +498,30 @@ private:
     return ret;
   }
 
+  // rope_freq_base
+  static VALUE _llama_context_params_set_rope_freq_base(VALUE self, VALUE rope_freq_base) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.rope_freq_base = NUM2DBL(rope_freq_base);
+    return DBL2NUM(ptr->params.rope_freq_base);
+  }
+
+  static VALUE _llama_context_params_get_rope_freq_base(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return DBL2NUM(ptr->params.rope_freq_base);
+  }
+
+  // rope_freq_scale
+  static VALUE _llama_context_params_set_rope_freq_scale(VALUE self, VALUE rope_freq_scale) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.rope_freq_scale = NUM2DBL(rope_freq_scale);
+    return DBL2NUM(ptr->params.rope_freq_scale);
+  }
+
+  static VALUE _llama_context_params_get_rope_freq_scale(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return DBL2NUM(ptr->params.rope_freq_scale);
+  }
+
   // low_vram
   static VALUE _llama_context_params_set_low_vram(VALUE self, VALUE low_vram) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -764,6 +792,12 @@ public:
     rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
     rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
     rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
+    rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_n_vocab_from_model), 0);
+    rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_n_ctx_from_model), 0);
+    rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_n_embd_from_model), 0);
+    rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
+    rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
+    rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
   }
 
 private:
@@ -908,6 +942,109 @@ private:
     }
     return Qnil;
   }
+
+  static VALUE _llama_model_get_n_vocab_from_model(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_vocab_from_model(ptr->model));
+  }
+
+  static VALUE _llama_model_get_n_ctx_from_model(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_ctx_from_model(ptr->model));
+  }
+
+  static VALUE _llama_model_get_n_embd_from_model(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_embd_from_model(ptr->model));
+  }
+
+  static VALUE _llama_model_get_vocab_from_model(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[1] = { rb_intern("capacity") };
+    VALUE kw_values[1] = { Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
+
+    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "capacity must be an integer");
+      return Qnil;
+    }
+
+    const int capacity = NUM2INT(kw_values[0]);
+
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    const int n = std::min(capacity, llama_n_vocab_from_model(ptr->model));
+    const char** vocabs = ALLOCA_N(const char*, n);
+    float* scores = ALLOCA_N(float, n);
+
+    llama_get_vocab_from_model(ptr->model, vocabs, scores, capacity);
+
+    VALUE vocabs_ary = rb_ary_new();
+    VALUE scores_ary = rb_ary_new();
+
+    for (int i = 0; i < n; i++) {
+      rb_ary_push(vocabs_ary, rb_str_new_cstr(vocabs[i]));
+      rb_ary_push(scores_ary, DBL2NUM(scores[i]));
+    }
+
+    VALUE ret = rb_ary_new3(2, vocabs_ary, scores_ary);
+
+    return ret;
+  }
+
+  static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
+    if (!RB_INTEGER_TYPE_P(token_)) {
+      rb_raise(rb_eArgError, "token must be an integer");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    const char* str = llama_token_to_str_with_model(ptr->model, token);
+    return rb_str_new_cstr(str);
+  }
+
+  static VALUE _llama_model_tokenize_with_model(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "text must be a String");
+      return Qnil;
+    }
+    if (kw_values[1] != Qundef && !RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_max_tokens must be an integer");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && (kw_values[2] != Qtrue && kw_values[2] != Qfalse)) {
+      rb_raise(rb_eArgError, "add_bos must be a boolean");
+      return Qnil;
+    }
+
+    VALUE text_ = kw_values[0];
+    std::string text = StringValueCStr(text_);
+    const bool add_bos = kw_values[2] == Qtrue ? true : false;
+    const int n_max_tokens = kw_values[1] != Qundef ? NUM2INT(kw_values[1]) : text.size() + (add_bos ? 1 : 0);
+
+    llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+
+    if (n_tokens < 0) {
+      rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
+      return Qnil;
+    }
+
+    VALUE ret = rb_ary_new2(n_tokens);
+    for (int i = 0; i < n_tokens; i++) {
+      rb_ary_store(ret, i, INT2NUM(tokens[i]));
+    }
+
+    RB_GC_GUARD(text_);
+    return ret;
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
```
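The hunk above wires `llama_n_vocab_from_model` and friends into Ruby. A small usage sketch, assuming a model file exists at the placeholder path; per the binding code, `vocab` requires a `capacity:` keyword and returns a pair of arrays (token pieces and their scores):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model  = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)

p model.n_vocab  # vocabulary size reported by the model
p model.n_ctx    # context size reported by the model
p model.n_embd   # embedding dimension

# First five vocabulary entries and their scores.
pieces, scores = model.vocab(capacity: 5)
pieces.zip(scores).each { |piece, score| puts "#{piece.inspect} #{score}" }
```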
```diff
@@ -1581,11 +1718,11 @@ private:
 
   static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
-    ID kw_table[
-    VALUE kw_values[
+    ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
+    VALUE kw_values[2] = { Qundef, Qundef };
     VALUE candidates = Qnil;
     rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
-    rb_get_kwargs(kw_args, kw_table,
+    rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
 
     if (!rb_obj_is_kind_of(kw_values[0], rb_cLLaMAContext)) {
       rb_raise(rb_eArgError, "guidance must be a Context");
@@ -1595,10 +1732,6 @@ private:
       rb_raise(rb_eArgError, "scale must be a float");
       return Qnil;
     }
-    if (!RB_FLOAT_TYPE_P(kw_values[2])) {
-      rb_raise(rb_eArgError, "smooth_factor must be a float");
-      return Qnil;
-    }
 
     LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
     if (ctx_ptr->ctx == NULL) {
@@ -1617,9 +1750,8 @@ private:
       return Qnil;
     }
     const float scale = NUM2DBL(kw_values[1]);
-    const float smooth_factor = NUM2DBL(kw_values[2]);
 
-    llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale
+    llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale);
 
     return Qnil;
   }
```
```diff
@@ -2062,6 +2194,10 @@ static VALUE rb_llama_mlock_supported(VALUE self) {
   return llama_mlock_supported() ? Qtrue : Qfalse;
 }
 
+static VALUE rb_llama_max_devices(VALUE self) {
+  return INT2NUM(llama_max_devices());
+}
+
 extern "C" void Init_llama_cpp(void) {
   rb_mLLaMACpp = rb_define_module("LLaMACpp");
 
@@ -2082,6 +2218,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
+  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
 
```
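A quick sketch contrasting the new `max_devices` module function with the `LLAMA_MAX_DEVICES` constant that was already exposed; the reported number depends on how the bundled llama.cpp was built:

```ruby
require 'llama_cpp'

p LLaMACpp.max_devices          # runtime query added in 0.3.4
p LLaMACpp::LLAMA_MAX_DEVICES   # compile-time constant exposed in earlier releases
```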