llama_cpp 0.5.1 → 0.5.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +32 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -11
- data/ext/llama_cpp/src/ggml-cuda.cu +1108 -699
- data/ext/llama_cpp/src/ggml-metal.m +93 -24
- data/ext/llama_cpp/src/ggml-metal.metal +407 -174
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +75 -43
- data/ext/llama_cpp/src/ggml.h +42 -32
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +1040 -201
- data/ext/llama_cpp/src/llama.h +13 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c45589a61587acfbe88add77ffb135a7949619ba2936178c59126c24c30e23cc
+  data.tar.gz: 5866b5b5f8dab59432cc91beca290f927a8d1bc694f83c8ccbe366c6f636f47c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 89714a2a920172c1ddc4fff56a11390ad97db62eb6bd4eefe3ba9376132bd6646eda7569acea49ff3a1ce87486cac0e623cac4fddfcb2b57629c30ee3457d38b
+  data.tar.gz: 2c5494528f55b86c57fccb18658350058210e85f31e9ecb8b3587a4da68a1465a0987a98e86dbae16fa5fea9a9502aed96afec3679e5553a82805f4436ef3020
data/CHANGELOG.md
CHANGED
@@ -1,11 +1,23 @@
+## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
+
+- Bump bundled llama.cpp from b1 to b1266.
+
+## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16
+
+- Bump bundled llama.cpp from b1198 to b1.
+- Add `n_ctx_train` method to Model and Context.
+- Add nvcc option to avoid link error ([#8](https://github.com/yoshoku/llama_cpp.rb/pull/8)).
+- Set encoding on output of `generate` module function to avoid encoding error ([#9](https://github.com/yoshoku/llama_cpp.rb/pull/9)).
+- Add `only_copy` option to ModelQuantizeParams.
+
 ## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
 
-- Bump bundled llama.cpp from
+- Bump bundled llama.cpp from b1140 to b1198.
 
 ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
 
 **Breaking Changes**
-- Bump bundled llama.cpp from
+- Bump bundled llama.cpp from b1060 to b1140.
 - Rename `token_to_str` method on Context to `token_to_piece` method.
 - Rename `token_to_str` method on Model to `token_to_piece` method.
 - Rename `type` method on Model to `desc` method.
@@ -14,7 +26,7 @@
 ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
 **Breaking Changes**
-- Bump bundled llama.cpp from master-097e121 to
+- Bump bundled llama.cpp from master-097e121 to b1060.
 - Support new file format GGUF.
   - You should re-convert / re-quantize your model files.
 - Remove vocab methods.
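The 0.5.2 entries above add user-facing API: `n_ctx_train` on Model and Context, the `only_copy` quantize option, and an encoding tag on `generate` output. A minimal sketch of the first and last of these; the constructor and `generate` keyword arguments follow the gem's README-style example, and the model path is a placeholder:

```ruby
require 'llama_cpp'

# Placeholder model path; constructor keywords follow the gem's README example.
params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

# n_ctx_train (added in 0.5.2) reports the context length the model was trained with,
# in contrast to n_ctx, which is the context length configured for this session.
puts model.n_ctx_train
puts context.n_ctx_train

# The generate module function now tags its output string with an encoding (#9),
# so printing non-ASCII completions no longer raises an encoding error.
puts LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)
```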
data/examples/prompt_jp.txt
CHANGED
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -112,7 +112,7 @@ create_makefile('llama_cpp/llama_cpp')
 if with_config('cublas')
   File.open('Makefile', 'a') do |f|
     f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
-    f.puts "\tnvcc -arch=native -c -o $@ $<"
+    f.puts "\tnvcc -shared -Xcompiler -fPIC -arch=native -c -o $@ $<"
  end
 end
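This appended nvcc rule is the "nvcc option to avoid link error" from the CHANGELOG ([#8]). The most likely mechanism: object files that get linked into the Ruby extension's shared library must be position-independent, so `-Xcompiler -fPIC` is passed through nvcc when ggml-cuda.cu is compiled; without it, the final link of the llama_cpp shared object can fail with relocation errors on Linux.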
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -692,6 +692,8 @@ public:
     rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_only_copy), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
   }
 
 private:
@@ -752,6 +754,18 @@ private:
     LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
     return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
   }
+
+  // only_copy
+  static VALUE _llama_model_quantize_params_set_only_copy(VALUE self, VALUE only_copy) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.only_copy = RTEST(only_copy) ? true : false;
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_quantize_params_get_only_copy(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
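The accessors above expose llama.cpp's `only_copy` quantization flag to Ruby. A hedged usage sketch; the `model_quantize` module function and its keyword names are assumed from the gem's existing API, and both paths are placeholders:

```ruby
require 'llama_cpp'

# With only_copy = true, tensors are copied as-is instead of being re-quantized,
# which is handy when you only want to rewrite a file into the current GGUF layout.
params = LLaMACpp::ModelQuantizeParams.new
params.only_copy = true

# Hypothetical paths; keyword names assumed from the gem's model_quantize signature.
LLaMACpp.model_quantize(input_path: 'input.gguf', output_path: 'output.gguf', params: params)
```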
@@ -810,6 +824,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+    rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
@@ -971,6 +986,11 @@ private:
     return INT2NUM(llama_model_n_ctx(ptr->model));
   }
 
+  static VALUE _llama_model_get_model_n_ctx_train(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_model_n_ctx_train(ptr->model));
+  }
+
   static VALUE _llama_model_get_model_n_embd(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_model_n_embd(ptr->model));
@@ -1026,7 +1046,7 @@ private:
 
     llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);
 
     if (n_tokens < 0) {
       rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
@@ -1341,6 +1361,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContext, "n_ctx_train", RUBY_METHOD_FUNC(_llama_context_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
@@ -1564,7 +1585,7 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-    const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
+    const int n = llama_tokenize(ptr->ctx, text.c_str(), text.size(), tokens.data(), n_max_tokens, add_bos);
     if (n < 0) {
       rb_raise(rb_eRuntimeError, "Failed to tokenize");
       return Qnil;
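Both tokenize call sites (Model above and Context here) change because the bundled llama.cpp now takes an explicit byte length for the input text in `llama_tokenize` / `llama_tokenize_with_model`; the wrapper simply passes `text.size()`, so nothing changes at the Ruby level. A sketch, with the keyword names assumed from the wrapper methods in this diff and a placeholder model path:

```ruby
require 'llama_cpp'

# Placeholder path; keyword names (text:, n_max_tokens:, add_bos:) are assumptions
# based on the tokenize wrappers shown in this diff.
params = LLaMACpp::ContextParams.new
model  = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: params)
tokens = model.tokenize(text: 'Hello, World.', n_max_tokens: 32, add_bos: true)
p tokens # => an Array of Integer token ids
```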
@@ -1733,6 +1754,15 @@ private:
     return INT2NUM(llama_n_ctx(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_ctx_train(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_n_ctx_train(ptr->ctx));
+  }
+
   static VALUE _llama_context_n_embd(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -1,8 +1,3 @@
-// defines MAP_ANONYMOUS
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "ggml-alloc.h"
 #include "ggml.h"
 #include <assert.h>
@@ -136,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
     return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
 }
 
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
@@ -343,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {
 
 // allocate uncommitted virtual memory to measure the size of the graph
 static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    //
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<
+    // 128GB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
     do {
         *base_addr = alloc_vmem(*size);
         if (*base_addr != NULL) {
@@ -404,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 
 //////////// compute graph allocator
 
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
         return false;