llama_cpp 0.3.8 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +110 -117
- data/ext/llama_cpp/src/ggml-alloc.c +79 -65
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +330 -69
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +102 -66
- data/ext/llama_cpp/src/ggml-metal.metal +113 -9
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama.cpp +4520 -2978
- data/ext/llama_cpp/src/llama.h +133 -125
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +7 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: af3a0e01bc9f3cfad4cee3f21144dd354640e1d4558125be36d4b499fa3b4c24
+  data.tar.gz: 042a3b0491d98fa6a093c684e6ab751152f37c8438a3b4a7b19cb2d8c7ab95a7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7ed85bd8438ee3b3adab884795c4aecb5b0d72ad57b7e02bc281b62c3b1d669efab62a020e03b09defe3084ecd8afacc4220303e99167d04d668650768c7392b
+  data.tar.gz: b705a0ccd2c7c1e15aed6383acb9d5a3d79d0a0c882a74c42b9099df9a27aff88ba08a2f06aa4d195382e8f41c1b16c0014a2047d1923369f275ca481d52bb21
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
+## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
+
+**Breaking Changes**
+- Bump bundled llama.cpp from master-097e121 to master-b1060.
+- Support new file format GGUF.
+  - You should re-convert / re-quantize your model files.
+- Remove vocab methods.
+- Move token_bos, token_eos, and token_nl methods to Context.
+- Add text, score, and type methods to Context.
+
 ## [[0.3.8](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.7...v0.3.8)] - 2023-08-19
 
 - Bump bundled llama.cpp from master-9ca4abe to master-097e121.
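To make the breaking changes above concrete, here is a minimal usage sketch. It assumes `context` is an already-initialized `LLaMACpp::Context` backed by a re-converted GGUF model; construction of the model and context is omitted, and the snippet is illustrative rather than copied from the gem's documentation.

```ruby
require 'llama_cpp'

# Assumes `context` is an initialized LLaMACpp::Context (setup omitted).
# Special-token accessors moved from the LLaMACpp module onto Context in 0.4.0:
bos = context.token_bos  # beginning-of-sequence token id
eos = context.token_eos  # end-of-sequence token id
nl  = context.token_nl   # newline token id

# New per-token introspection methods added to Context in 0.4.0:
puts context.text(eos)   # text piece associated with the token
puts context.score(eos)  # vocabulary score of the token
puts context.type(eos)   # token type, e.g. LLaMACpp::LLAMA_TOKEN_TYPE_CONTROL
```

Code that previously called `LLaMACpp.token_bos`, `LLaMACpp.token_eos`, or `LLaMACpp.token_nl` now needs to obtain these values from a context, as the `examples/chat.rb` diff below shows.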
data/README.md
CHANGED
@@ -51,7 +51,7 @@ $ git clone https://huggingface.co/openlm-research/open_llama_7b
 $ cd ../
 $ python3 convert.py models/open_llama_7b
 $ make
-$ ./quantize ./models/open_llama_7b/ggml-model-f16.
+$ ./quantize ./models/open_llama_7b/ggml-model-f16.gguf ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
 ```
 
 An example of Ruby code that generates sentences with the quantization model is as follows:
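The README change above switches the quantization source to the new GGUF file produced by convert.py. The same step can also be driven from Ruby via the gem's `model_quantize` module function (registered in llama_cpp.cpp below); note that the class and keyword-argument names in this sketch (`ModelQuantizeParams`, `input_path:`, `output_path:`, `params:`) are assumptions for illustration and should be checked against the gem's own documentation.

```ruby
require 'llama_cpp'

# Hypothetical sketch: quantize an f16 GGUF model to q4_0 from Ruby.
# Class and keyword names below are assumed, not taken from this diff.
params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0

LLaMACpp.model_quantize(input_path: './models/open_llama_7b/ggml-model-f16.gguf',
                        output_path: './models/open_llama_7b/ggml-model-q4_0.bin',
                        params: params)
```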
data/examples/chat.rb
CHANGED
@@ -49,8 +49,6 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
       n_keep = options[:keep]
       n_keep = embd_input.size if n_keep > embd_input.size
 
-      token_newline = context.tokenize(text: "\n", add_bos: false)
-
       last_n_tokens = [0] * n_ctx
       interactive = true
       is_interacting = false
@@ -101,8 +99,8 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
             last_n_tokens.shift
             last_n_tokens.push(id)
 
-            if id ==
-              id =
+            if id == context.token_eos
+              id = context.token_nl
               unless antiprompt.empty?
                 first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
                 embd_input.concat(first_antiprompt)
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -53,7 +53,7 @@ if with_config('metal')
   $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
   $CXXFLAGS << ' -DGGML_USE_METAL'
   $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
-  $objs = %w[ggml.o ggml-alloc.o
+  $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
   $objs << 'k_quants.o' unless with_config('no_k_quants')
 end
 
@@ -61,7 +61,7 @@ if with_config('cublas')
   $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
   $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
   $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
-  $objs = %w[ggml-
+  $objs = %w[ggml.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
   $objs << 'k_quants.o' unless with_config('no_k_quants')
 end
 
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -808,10 +808,9 @@ public:
     rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
     rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
     rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
-    rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(
-    rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(
-    rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(
-    rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
+    rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
+    rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+    rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
     rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
@@ -960,53 +959,19 @@ private:
     return Qnil;
   }
 
-  static VALUE
+  static VALUE _llama_model_get_model_n_vocab(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    return INT2NUM(
+    return INT2NUM(llama_model_n_vocab(ptr->model));
   }
 
-  static VALUE
+  static VALUE _llama_model_get_model_n_ctx(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    return INT2NUM(
+    return INT2NUM(llama_model_n_ctx(ptr->model));
   }
 
-  static VALUE
+  static VALUE _llama_model_get_model_n_embd(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    return INT2NUM(
-  }
-
-  static VALUE _llama_model_get_vocab_from_model(int argc, VALUE* argv, VALUE self) {
-    VALUE kw_args = Qnil;
-    ID kw_table[1] = { rb_intern("capacity") };
-    VALUE kw_values[1] = { Qundef };
-    rb_scan_args(argc, argv, ":", &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
-    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
-      rb_raise(rb_eArgError, "capacity must be an integer");
-      return Qnil;
-    }
-
-    const int capacity = NUM2INT(kw_values[0]);
-
-    LLaMAModelWrapper* ptr = get_llama_model(self);
-    const int n = std::min(capacity, llama_n_vocab_from_model(ptr->model));
-    const char** vocabs = ALLOCA_N(const char*, n);
-    float* scores = ALLOCA_N(float, n);
-
-    llama_get_vocab_from_model(ptr->model, vocabs, scores, capacity);
-
-    VALUE vocabs_ary = rb_ary_new();
-    VALUE scores_ary = rb_ary_new();
-
-    for (int i = 0; i < n; i++) {
-      rb_ary_push(vocabs_ary, rb_str_new_cstr(vocabs[i]));
-      rb_ary_push(scores_ary, DBL2NUM(scores[i]));
-    }
-
-    VALUE ret = rb_ary_new3(2, vocabs_ary, scores_ary);
-
-    return ret;
+    return INT2NUM(llama_model_n_embd(ptr->model));
   }
 
   static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
@@ -1016,8 +981,20 @@ private:
     }
     const llama_token token = NUM2INT(token_);
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    [2 removed lines not shown]
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+    if (n_tokens < 0) {
+      result.resize(-n_tokens);
+      const int check = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+      if (check != -n_tokens) {
+        rb_raise(rb_eRuntimeError, "failed to convert");
+        return Qnil;
+      }
+    } else {
+      result.resize(n_tokens);
+    }
+    std::string ret(result.data(), result.size());
+    return rb_str_new_cstr(ret.c_str());
   }
 
   static VALUE _llama_model_tokenize_with_model(int argc, VALUE* argv, VALUE self) {
@@ -1343,7 +1320,12 @@ public:
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
-    rb_define_method(rb_cLLaMAContext, "
+    rb_define_method(rb_cLLaMAContext, "text", RUBY_METHOD_FUNC(_llama_context_text), 1);
+    rb_define_method(rb_cLLaMAContext, "score", RUBY_METHOD_FUNC(_llama_context_score), 1);
+    rb_define_method(rb_cLLaMAContext, "type", RUBY_METHOD_FUNC(_llama_context_type), 1);
+    rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
+    rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
+    rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
     rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -1592,8 +1574,20 @@ private:
       return Qnil;
     }
     const llama_token token = NUM2INT(token_);
-    [2 removed lines not shown]
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+      result.resize(-n_tokens);
+      const int check = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+      if (check != -n_tokens) {
+        rb_raise(rb_eRuntimeError, "failed to convert");
+        return Qnil;
+      }
+    } else {
+      result.resize(n_tokens);
+    }
+    std::string ret(result.data(), result.size());
+    return rb_str_new_cstr(ret.c_str());
   }
 
   static VALUE _llama_context_logits(VALUE self) {
@@ -1649,41 +1643,64 @@ private:
     return output;
   }
 
-  static VALUE
-  [3 removed lines not shown]
-    rb_scan_args(argc, argv, ":", &kw_args);
-    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-  [1 removed line not shown]
-    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
-      rb_raise(rb_eArgError, "capacity must be an integer");
+  static VALUE _llama_context_text(VALUE self, VALUE token_) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
+    const llama_token token = NUM2INT(token_);
+    const char* text = llama_token_get_text(ptr->ctx, token);
+    return rb_str_new_cstr(text);
+  }
 
+  static VALUE _llama_context_score(VALUE self, VALUE token_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
      return Qnil;
    }
+    const llama_token token = NUM2INT(token_);
+    const float score = llama_token_get_score(ptr->ctx, token);
+    return DBL2NUM(score);
+  }
 
-  [6 removed lines not shown]
+  static VALUE _llama_context_type(VALUE self, VALUE token_) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    const int type = llama_token_get_type(ptr->ctx, token);
+    return INT2NUM(type);
+  }
 
-  [1 removed line not shown]
+  static VALUE _llama_context_token_bos(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_token_bos(ptr->ctx));
+  }
 
-  [5 removed lines not shown]
+  static VALUE _llama_context_token_eos(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
    }
+    return INT2NUM(llama_token_eos(ptr->ctx));
+  }
 
-  [1 removed line not shown]
+  static VALUE _llama_context_token_nl(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_token_nl(ptr->ctx));
   }
 
   static VALUE _llama_context_n_vocab(VALUE self) {
@@ -2474,23 +2491,15 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   return Qnil;
 }
 
-static VALUE rb_llama_token_bos(VALUE self) {
-  return INT2NUM(llama_token_bos());
-}
-
-static VALUE rb_llama_token_eos(VALUE self) {
-  return INT2NUM(llama_token_eos());
-}
-
-static VALUE rb_llama_token_nl(VALUE self) {
-  return INT2NUM(llama_token_nl());
-}
-
 static VALUE rb_llama_print_system_info(VALUE self) {
   const char* result = llama_print_system_info();
   return rb_utf8_str_new_cstr(result);
 }
 
+static VALUE rb_llama_time_us(VALUE self) {
+  return LONG2NUM(llama_time_us());
+}
+
 static VALUE rb_llama_mmap_supported(VALUE self) {
   return llama_mmap_supported() ? Qtrue : Qfalse;
 }
@@ -2519,16 +2528,29 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
   rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
   rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
-  rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
-  rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
-  rb_define_module_function(rb_mLLaMACpp, "token_nl", rb_llama_token_nl, 0);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+  rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_ERROR", INT2NUM(LLAMA_LOG_LEVEL_ERROR));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_WARN", INT2NUM(LLAMA_LOG_LEVEL_WARN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_INFO", INT2NUM(LLAMA_LOG_LEVEL_INFO));
+
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
+
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_CONTROL", INT2NUM(LLAMA_TOKEN_TYPE_CONTROL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_USER_DEFINED", INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -2547,6 +2569,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_RULE_REF", INT2NUM(LLAMA_GRETYPE_RULE_REF));
@@ -2556,39 +2580,9 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));
 
   std::stringstream ss_magic;
-  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
-  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGJT", rb_str_new2(ss_magic.str().c_str()));
-
-  ss_magic.str("");
-  ss_magic.clear(std::stringstream::goodbit);
-  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
-  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
-
-  ss_magic.str("");
-  ss_magic.clear(std::stringstream::goodbit);
-  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGMF;
-  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGMF", rb_str_new2(ss_magic.str().c_str()));
-
-  ss_magic.str("");
-  ss_magic.clear(std::stringstream::goodbit);
-  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGML;
-  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGML", rb_str_new2(ss_magic.str().c_str()));
-
-  ss_magic.str("");
-  ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
-  ss_magic.str("");
-  ss_magic.clear(std::stringstream::goodbit);
-  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
-  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC", rb_str_new2(ss_magic.str().c_str()));
-
-  ss_magic.str("");
-  ss_magic.clear(std::stringstream::goodbit);
-  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_UNVERSIONED;
-  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_UNVERSIONED", rb_str_new2(ss_magic.str().c_str()));
-
   ss_magic.str("");
   ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
@@ -2599,6 +2593,5 @@ extern "C" void Init_llama_cpp(void) {
   ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
   rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
-  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
 }
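A short Ruby-side sketch of the bindings changed in this file: the Model accessors now backed by llama_model_n_*, the buffer-resizing token_to_str, and the new time_us module function. It assumes `model` is an already-loaded `LLaMACpp::Model`; the loading code is omitted and the snippet is illustrative only.

```ruby
require 'llama_cpp'

# Assumes `model` is an already-loaded LLaMACpp::Model (construction omitted).
puts model.n_vocab   # vocabulary size      (wraps llama_model_n_vocab)
puts model.n_ctx     # training context len (wraps llama_model_n_ctx)
puts model.n_embd    # embedding dimension  (wraps llama_model_n_embd)

# token_to_str now grows its internal buffer when a piece is longer than 8 bytes:
puts model.token_to_str(0)

# New module-level timing helper (wraps llama_time_us):
t0 = LLaMACpp.time_us
# ... run an evaluation ...
puts "elapsed: #{LLaMACpp.time_us - t0} microseconds"
```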
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -8,6 +8,7 @@
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
 //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,8 +68,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
-    int parse_seq[
-    [1 removed line not shown]
+    int parse_seq[GGML_MAX_CONCUR];
+    int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -76,7 +77,7 @@ struct ggml_allocr {
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == NULL) {
             alloc->allocated_tensors[i] = tensor;
@@ -85,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == tensor ||
             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -238,15 +239,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }
 
-void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
-    int pos = 0;
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
     for (int i = 0; i < n; i++) {
-        [1 removed line not shown]
-        alloc->parse_seq[pos] = list[i];
-        pos++;
-        }
+        alloc->parse_seq[i] = list[i];
     }
-    alloc->
+    alloc->parse_seq_len = n;
 }
 
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
@@ -269,7 +266,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.max_size = */ 0,
         /*.measure = */ false,
         /*.parse_seq = */ {0},
-        /*.
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -298,7 +295,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.max_size = */ 0,
         /*.measure = */ true,
         /*.parse_seq = */ {0},
-        /*.
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -445,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
             else {
                 AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                 node->data = parent->data;
+                return;
             }
-            return;
         }
     }
 }
@@ -497,69 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 allocate_node(alloc, input);
             }
         }
-        [14 removed lines not shown]
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+        int last_barrier_pos = 0;
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+        for (int ind = 0; ind < n_nodes; ind++) {
+            // allocate a node if there is no parse_seq or this is not a barrier
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                struct ggml_tensor * node = gf->nodes[i];
+
+                // allocate parents (leafs)
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    allocate_node(alloc, parent);
                 }
-                allocate_node(alloc, parent);
-            }
 
-            [2 removed lines not shown]
+                // allocate node
+                allocate_node(alloc, node);
 
-            [9 removed lines not shown]
+                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    AT_PRINTF("%s", parent->name);
+                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                        AT_PRINTF(", ");
+                    }
                 }
+                AT_PRINTF("\n");
             }
-            [1 removed line not shown]
+
 
             // update parents
-            [13 removed lines not shown]
-            struct hash_node * view_src_hn = hash_get(ht, view_src);
-            view_src_hn->n_views -= 1;
-            AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
-            if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                ggml_allocator_free_tensor(alloc, view_src);
+            // update immediately if there is no parse_seq
+            // update only at barriers if there is parse_seq
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                int update_end = alloc->parse_seq_len ? ind : ind + 1;
+                for (int i = update_start; i < update_end; i++) {
+                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                    struct ggml_tensor * node = gf->nodes[node_i];
+
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * parent = node->src[j];
+                        if (parent == NULL) {
+                            break;
                         }
-            [4 removed lines not shown]
+                        struct hash_node * p_hn = hash_get(ht, parent);
+                        p_hn->n_children -= 1;
+
+                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                            if (ggml_is_view(parent)) {
+                                struct ggml_tensor * view_src = get_view_source(parent);
+                                struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                view_src_hn->n_views -= 1;
+                                AT_PRINTF("view_src %s\n", view_src->name);
+                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, view_src);
+                                }
+                            }
+                            else {
+                                if (parent->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, parent);
+                                }
+                            }
                         }
                     }
                 }
+                AT_PRINTF("\n");
+                if (alloc->parse_seq_len) {
+                    last_barrier_pos = ind + 1;
+                }
             }
-            AT_PRINTF("\n");
         }
         // free graph outputs here that wouldn't be freed otherwise because they have no children
         if (outputs != NULL && outputs[g] != NULL) {
data/ext/llama_cpp/src/ggml-alloc.h
CHANGED
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);