llama_cpp 0.3.8 → 0.4.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +110 -117
- data/ext/llama_cpp/src/ggml-alloc.c +79 -65
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +330 -69
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +102 -66
- data/ext/llama_cpp/src/ggml-metal.metal +113 -9
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama.cpp +4520 -2978
- data/ext/llama_cpp/src/llama.h +133 -125
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +7 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: af3a0e01bc9f3cfad4cee3f21144dd354640e1d4558125be36d4b499fa3b4c24
+  data.tar.gz: 042a3b0491d98fa6a093c684e6ab751152f37c8438a3b4a7b19cb2d8c7ab95a7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7ed85bd8438ee3b3adab884795c4aecb5b0d72ad57b7e02bc281b62c3b1d669efab62a020e03b09defe3084ecd8afacc4220303e99167d04d668650768c7392b
+  data.tar.gz: b705a0ccd2c7c1e15aed6383acb9d5a3d79d0a0c882a74c42b9099df9a27aff88ba08a2f06aa4d195382e8f41c1b16c0014a2047d1923369f275ca481d52bb21
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
+## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
+
+**Breaking Changes**
+- Bump bundled llama.cpp from master-097e121 to master-b1060.
+- Support new file format GGUF.
+  - You should re-convert / re-quantize your model files.
+- Remove vocab methods.
+- Move token_bos, token_eos, and token_nl methods to Context.
+- Add text, score, and type methods to Context.
+
 ## [[0.3.8](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.7...v0.3.8)] - 2023-08-19
 
 - Bump bundled llama.cpp from master-9ca4abe to master-097e121.
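Since this release moves the token id helpers from module functions onto Context, callers need a small migration. A sketch under assumed 0.4.0 API (constructor keywords mirror the gem's chat.rb example; the model path is a placeholder):

```ruby
require 'llama_cpp'

# Build a model/context pair; '/path/to/model.gguf' is a placeholder.
params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

# 0.3.x: LLaMACpp.token_bos / LLaMACpp.token_eos / LLaMACpp.token_nl
# 0.4.0: ask the context, since the ids now depend on the loaded vocabulary
bos = context.token_bos
eos = context.token_eos
nl  = context.token_nl
```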
data/README.md
CHANGED
@@ -51,7 +51,7 @@ $ git clone https://huggingface.co/openlm-research/open_llama_7b
 $ cd ../
 $ python3 convert.py models/open_llama_7b
 $ make
-$ ./quantize ./models/open_llama_7b/ggml-model-f16.
+$ ./quantize ./models/open_llama_7b/ggml-model-f16.gguf ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
 ```
 
 An example of Ruby code that generates sentences with the quantization model is as follows:
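The Ruby example the README's last context line refers to falls outside the diff context; a minimal sketch of that kind of generation code, assuming LLaMACpp.generate is still the gem's convenience helper:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 42
model = LLaMACpp::Model.new(model_path: './models/open_llama_7b/ggml-model-q4_0.bin', params: params)
context = LLaMACpp::Context.new(model: model)

puts LLaMACpp.generate(context, 'Hello, World.')
```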
data/examples/chat.rb
CHANGED
@@ -49,8 +49,6 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
       n_keep = options[:keep]
       n_keep = embd_input.size if n_keep > embd_input.size
 
-      token_newline = context.tokenize(text: "\n", add_bos: false)
-
       last_n_tokens = [0] * n_ctx
       interactive = true
       is_interacting = false
@@ -101,8 +99,8 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
           last_n_tokens.shift
           last_n_tokens.push(id)
 
-          if id ==
-            id =
+          if id == context.token_eos
+            id = context.token_nl
             unless antiprompt.empty?
               first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
               embd_input.concat(first_antiprompt)
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -53,7 +53,7 @@ if with_config('metal')
  $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
  $CXXFLAGS << ' -DGGML_USE_METAL'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
- $objs = %w[ggml.o ggml-alloc.o
+ $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
 end
 
@@ -61,7 +61,7 @@ if with_config('cublas')
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
- $objs = %w[ggml-
+ $objs = %w[ggml.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
 end
 
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -808,10 +808,9 @@ public:
     rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
     rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
     rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
-    rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(
-    rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(
-    rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(
-    rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
+    rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
+    rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+    rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
     rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
@@ -960,53 +959,19 @@ private:
     return Qnil;
   }
 
-  static VALUE
+  static VALUE _llama_model_get_model_n_vocab(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    return INT2NUM(
+    return INT2NUM(llama_model_n_vocab(ptr->model));
   }
 
-  static VALUE
+  static VALUE _llama_model_get_model_n_ctx(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    return INT2NUM(
+    return INT2NUM(llama_model_n_ctx(ptr->model));
   }
 
-  static VALUE
+  static VALUE _llama_model_get_model_n_embd(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    return INT2NUM(
-  }
-
-  static VALUE _llama_model_get_vocab_from_model(int argc, VALUE* argv, VALUE self) {
-    VALUE kw_args = Qnil;
-    ID kw_table[1] = { rb_intern("capacity") };
-    VALUE kw_values[1] = { Qundef };
-    rb_scan_args(argc, argv, ":", &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
-    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
-      rb_raise(rb_eArgError, "capacity must be an integer");
-      return Qnil;
-    }
-
-    const int capacity = NUM2INT(kw_values[0]);
-
-    LLaMAModelWrapper* ptr = get_llama_model(self);
-    const int n = std::min(capacity, llama_n_vocab_from_model(ptr->model));
-    const char** vocabs = ALLOCA_N(const char*, n);
-    float* scores = ALLOCA_N(float, n);
-
-    llama_get_vocab_from_model(ptr->model, vocabs, scores, capacity);
-
-    VALUE vocabs_ary = rb_ary_new();
-    VALUE scores_ary = rb_ary_new();
-
-    for (int i = 0; i < n; i++) {
-      rb_ary_push(vocabs_ary, rb_str_new_cstr(vocabs[i]));
-      rb_ary_push(scores_ary, DBL2NUM(scores[i]));
-    }
-
-    VALUE ret = rb_ary_new3(2, vocabs_ary, scores_ary);
-
-    return ret;
+    return INT2NUM(llama_model_n_embd(ptr->model));
   }
 
   static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
@@ -1016,8 +981,20 @@ private:
     }
     const llama_token token = NUM2INT(token_);
     LLaMAModelWrapper* ptr = get_llama_model(self);
-
-
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+    if (n_tokens < 0) {
+      result.resize(-n_tokens);
+      const int check = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+      if (check != -n_tokens) {
+        rb_raise(rb_eRuntimeError, "failed to convert");
+        return Qnil;
+      }
+    } else {
+      result.resize(n_tokens);
+    }
+    std::string ret(result.data(), result.size());
+    return rb_str_new_cstr(ret.c_str());
   }
 
   static VALUE _llama_model_tokenize_with_model(int argc, VALUE* argv, VALUE self) {
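The rewritten token_to_str binding follows llama.cpp's two-pass buffer convention: probe with a small buffer, and when the call returns a negative value, its magnitude is the required length, so the vector is resized and the call repeated. None of this is visible from Ruby; a usage sketch (model as in the earlier examples):

```ruby
# Convert a few token ids back to their text pieces;
# buffer sizing happens on the C++ side.
(0...5).each do |id|
  puts format('%6d: %s', id, model.token_to_str(id))
end
```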
@@ -1343,7 +1320,12 @@ public:
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
-    rb_define_method(rb_cLLaMAContext, "
+    rb_define_method(rb_cLLaMAContext, "text", RUBY_METHOD_FUNC(_llama_context_text), 1);
+    rb_define_method(rb_cLLaMAContext, "score", RUBY_METHOD_FUNC(_llama_context_score), 1);
+    rb_define_method(rb_cLLaMAContext, "type", RUBY_METHOD_FUNC(_llama_context_type), 1);
+    rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
+    rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
+    rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
     rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -1592,8 +1574,20 @@ private:
       return Qnil;
     }
     const llama_token token = NUM2INT(token_);
-
-
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+      result.resize(-n_tokens);
+      const int check = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+      if (check != -n_tokens) {
+        rb_raise(rb_eRuntimeError, "failed to convert");
+        return Qnil;
+      }
+    } else {
+      result.resize(n_tokens);
+    }
+    std::string ret(result.data(), result.size());
+    return rb_str_new_cstr(ret.c_str());
   }
 
   static VALUE _llama_context_logits(VALUE self) {
@@ -1649,41 +1643,64 @@ private:
     return output;
   }
 
-  static VALUE
-
-
-
-    rb_scan_args(argc, argv, ":", &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
-    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
-      rb_raise(rb_eArgError, "capacity must be an integer");
+  static VALUE _llama_context_text(VALUE self, VALUE token_) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
+    const llama_token token = NUM2INT(token_);
+    const char* text = llama_token_get_text(ptr->ctx, token);
+    return rb_str_new_cstr(text);
+  }
 
+  static VALUE _llama_context_score(VALUE self, VALUE token_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
+    const llama_token token = NUM2INT(token_);
+    const float score = llama_token_get_score(ptr->ctx, token);
+    return DBL2NUM(score);
+  }
 
-
-
-
-
-
-
+  static VALUE _llama_context_type(VALUE self, VALUE token_) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    const int type = llama_token_get_type(ptr->ctx, token);
+    return INT2NUM(type);
+  }
 
-
+  static VALUE _llama_context_token_bos(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_token_bos(ptr->ctx));
+  }
 
-
-
-
-
-
+  static VALUE _llama_context_token_eos(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
     }
+    return INT2NUM(llama_token_eos(ptr->ctx));
+  }
 
-
+  static VALUE _llama_context_token_nl(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_token_nl(ptr->ctx));
   }
 
   static VALUE _llama_context_n_vocab(VALUE self) {
@@ -2474,23 +2491,15 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   return Qnil;
 }
 
-static VALUE rb_llama_token_bos(VALUE self) {
-  return INT2NUM(llama_token_bos());
-}
-
-static VALUE rb_llama_token_eos(VALUE self) {
-  return INT2NUM(llama_token_eos());
-}
-
-static VALUE rb_llama_token_nl(VALUE self) {
-  return INT2NUM(llama_token_nl());
-}
-
 static VALUE rb_llama_print_system_info(VALUE self) {
   const char* result = llama_print_system_info();
   return rb_utf8_str_new_cstr(result);
 }
 
+static VALUE rb_llama_time_us(VALUE self) {
+  return LONG2NUM(llama_time_us());
+}
+
 static VALUE rb_llama_mmap_supported(VALUE self) {
   return llama_mmap_supported() ? Qtrue : Qfalse;
 }
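llama_time_us is llama.cpp's microsecond clock, newly exposed as LLaMACpp.time_us. A small timing sketch, assuming a context and the generate helper from the earlier examples:

```ruby
t0 = LLaMACpp.time_us
output = LLaMACpp.generate(context, 'Hello, World.')
t1 = LLaMACpp.time_us
puts format('generated in %.2f s', (t1 - t0) / 1_000_000.0)
```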
@@ -2519,16 +2528,29 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
  rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
- rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
- rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
- rb_define_module_function(rb_mLLaMACpp, "token_nl", rb_llama_token_nl, 0);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+ rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
 
  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
 
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_ERROR", INT2NUM(LLAMA_LOG_LEVEL_ERROR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_WARN", INT2NUM(LLAMA_LOG_LEVEL_WARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_INFO", INT2NUM(LLAMA_LOG_LEVEL_INFO));
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_CONTROL", INT2NUM(LLAMA_TOKEN_TYPE_CONTROL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_USER_DEFINED", INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
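The new constants mirror llama.cpp's GGUF-era vocab and token-type enums. A sketch that classifies a token by combining Context#type with the constants registered above:

```ruby
case context.type(context.token_eos)
when LLaMACpp::LLAMA_TOKEN_TYPE_NORMAL       then puts 'normal token'
when LLaMACpp::LLAMA_TOKEN_TYPE_CONTROL      then puts 'control token (BOS/EOS, etc.)'
when LLaMACpp::LLAMA_TOKEN_TYPE_BYTE         then puts 'byte fallback token'
when LLaMACpp::LLAMA_TOKEN_TYPE_USER_DEFINED then puts 'user-defined token'
else puts 'undefined, unknown, or unused'
end
```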
@@ -2547,6 +2569,8 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
 
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_RULE_REF", INT2NUM(LLAMA_GRETYPE_RULE_REF));
@@ -2556,39 +2580,9 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));
 
  std::stringstream ss_magic;
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGJT", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGMF;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGMF", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGML;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGML", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_UNVERSIONED;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_UNVERSIONED", rb_str_new2(ss_magic.str().c_str()));
-
  ss_magic.str("");
  ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
@@ -2599,6 +2593,5 @@ extern "C" void Init_llama_cpp(void) {
  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
 }
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -8,6 +8,7 @@
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
 //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,8 +68,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
-    int parse_seq[
-
+    int parse_seq[GGML_MAX_CONCUR];
+    int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -76,7 +77,7 @@ struct ggml_allocr {
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == NULL) {
             alloc->allocated_tensors[i] = tensor;
@@ -85,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == tensor ||
             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -238,15 +239,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }
 
-void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
-    int pos = 0;
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
     for (int i = 0; i < n; i++) {
-
-        alloc->parse_seq[pos] = list[i];
-        pos++;
-    }
+        alloc->parse_seq[i] = list[i];
     }
-    alloc->
+    alloc->parse_seq_len = n;
 }
 
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
@@ -269,7 +266,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.max_size      = */ 0,
         /*.measure       = */ false,
         /*.parse_seq     = */ {0},
-        /*.
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -298,7 +295,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.max_size      = */ 0,
         /*.measure       = */ true,
         /*.parse_seq     = */ {0},
-        /*.
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -445,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 else {
                     AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                     node->data = parent->data;
+                    return;
                 }
-                return;
             }
         }
     }
@@ -497,69 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 allocate_node(alloc, input);
             }
         }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+        int last_barrier_pos = 0;
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+        for (int ind = 0; ind < n_nodes; ind++) {
+            // allocate a node if there is no parse_seq or this is not a barrier
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                struct ggml_tensor * node = gf->nodes[i];
+
+                // allocate parents (leafs)
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    allocate_node(alloc, parent);
                 }
-            allocate_node(alloc, parent);
-        }
 
-
-
+                // allocate node
+                allocate_node(alloc, node);
 
-
-
-
-
-
-
-
-
-
+                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    AT_PRINTF("%s", parent->name);
+                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                        AT_PRINTF(", ");
+                    }
                 }
+                AT_PRINTF("\n");
             }
-
+
 
             // update parents
-
-
-
-
-
-
-
-
-
-
-
-
-
-            struct hash_node * view_src_hn = hash_get(ht, view_src);
-            view_src_hn->n_views -= 1;
-            AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
-            if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                ggml_allocator_free_tensor(alloc, view_src);
+            // update immediately if there is no parse_seq
+            // update only at barriers if there is parse_seq
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                int update_end   = alloc->parse_seq_len ? ind : ind + 1;
+                for (int i = update_start; i < update_end; i++) {
+                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                    struct ggml_tensor * node = gf->nodes[node_i];
+
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * parent = node->src[j];
+                        if (parent == NULL) {
+                            break;
                         }
-
-
-
-
+                        struct hash_node * p_hn = hash_get(ht, parent);
+                        p_hn->n_children -= 1;
+
+                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                            if (ggml_is_view(parent)) {
+                                struct ggml_tensor * view_src = get_view_source(parent);
+                                struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                view_src_hn->n_views -= 1;
+                                AT_PRINTF("view_src %s\n", view_src->name);
+                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, view_src);
+                                }
+                            }
+                            else {
+                                if (parent->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, parent);
+                                }
+                            }
                         }
                     }
                 }
+                AT_PRINTF("\n");
+                if (alloc->parse_seq_len) {
+                    last_barrier_pos = ind + 1;
+                }
             }
-            AT_PRINTF("\n");
         }
         // free graph outputs here that wouldn't be freed otherwise because they have no children
        if (outputs != NULL && outputs[g] != NULL) {
data/ext/llama_cpp/src/ggml-alloc.h
CHANGED
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);