llama_cpp 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +32 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -11
- data/ext/llama_cpp/src/ggml-cuda.cu +1108 -699
- data/ext/llama_cpp/src/ggml-metal.m +93 -24
- data/ext/llama_cpp/src/ggml-metal.metal +407 -174
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +75 -43
- data/ext/llama_cpp/src/ggml.h +42 -32
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +1040 -201
- data/ext/llama_cpp/src/llama.h +13 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c45589a61587acfbe88add77ffb135a7949619ba2936178c59126c24c30e23cc
+  data.tar.gz: 5866b5b5f8dab59432cc91beca290f927a8d1bc694f83c8ccbe366c6f636f47c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 89714a2a920172c1ddc4fff56a11390ad97db62eb6bd4eefe3ba9376132bd6646eda7569acea49ff3a1ce87486cac0e623cac4fddfcb2b57629c30ee3457d38b
+  data.tar.gz: 2c5494528f55b86c57fccb18658350058210e85f31e9ecb8b3587a4da68a1465a0987a98e86dbae16fa5fea9a9502aed96afec3679e5553a82805f4436ef3020
data/CHANGELOG.md
CHANGED
@@ -1,11 +1,23 @@
+## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
+
+- Bump bundled llama.cpp from b1 to b1266.
+
+## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16
+
+- Bump bundled llama.cpp from b1198 to b1.
+- Add `n_ctx_train` method to Model and Context.
+- Add nvcc option to avoid link error ([#8](https://github.com/yoshoku/llama_cpp.rb/pull/8)).
+- Set encoding on output of `generate` module function to avoid encoding error ([#9](https://github.com/yoshoku/llama_cpp.rb/pull/9)).
+- Add `only_copy` option to ModelQuantizeParams.
+
 ## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
 
-- Bump bundled llama.cpp from
+- Bump bundled llama.cpp from b1140 to b1198.
 
 ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
 
 **Breaking Changes**
-- Bump bundled llama.cpp from
+- Bump bundled llama.cpp from b1060 to b1140.
 - Rename `token_to_str` method on Context to `token_to_piece` method.
 - Rename `token_to_str` method on Model to `token_to_piece` method.
 - Rename `type` method on Model to `desc` method.
@@ -14,7 +26,7 @@
 ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
 **Breaking Changes**
-- Bump bundled llama.cpp from master-097e121 to
+- Bump bundled llama.cpp from master-097e121 to b1060.
 - Support new file format GGUF.
   - You should re-convert / re-quantize your model files.
 - Remove vocab methods.
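For context, a minimal Ruby sketch of the API touched by the 0.5.2 entries above. The constructor keywords (model_path:, params:) and the n_predict: option are assumptions based on the gem's README-era usage, not taken from this diff; the model path is a placeholder.

    require 'llama_cpp'

    params  = LLaMACpp::ContextParams.new
    model   = LLaMACpp::Model.new(model_path: 'model.gguf', params: params)
    context = LLaMACpp::Context.new(model: model)

    # Since 0.5.2 the generate module function tags its output as UTF-8,
    # so multibyte prompts (e.g. data/examples/prompt_jp.txt) no longer
    # raise encoding errors on the Ruby side.
    text = LLaMACpp.generate(context, 'Hello, my name is', n_predict: 32)
    puts text.encoding # => UTF-8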
data/examples/prompt_jp.txt
CHANGED
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -112,7 +112,7 @@ create_makefile('llama_cpp/llama_cpp')
 if with_config('cublas')
   File.open('Makefile', 'a') do |f|
     f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
-    f.puts "\tnvcc -arch=native -c -o $@ $<"
+    f.puts "\tnvcc -shared -Xcompiler -fPIC -arch=native -c -o $@ $<"
   end
 end
 
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -692,6 +692,8 @@ public:
     rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_only_copy), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
   }
 
 private:
@@ -752,6 +754,18 @@ private:
     LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
     return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
   }
+
+  // only_copy
+  static VALUE _llama_model_quantize_params_set_only_copy(VALUE self, VALUE only_copy) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.only_copy = RTEST(only_copy) ? true : false;
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_quantize_params_get_only_copy(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
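A hedged sketch of how the only_copy accessor bound above might be used from Ruby; the model_quantize keyword names (input_path:, output_path:, params:) are assumptions, not taken from this diff.

    params = LLaMACpp::ModelQuantizeParams.new
    params.only_copy = true # copy tensors as-is instead of re-quantizing them
    params.only_copy        # => true

    LLaMACpp.model_quantize(input_path: 'in.gguf', output_path: 'out.gguf', params: params)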
@@ -810,6 +824,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+    rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
@@ -971,6 +986,11 @@ private:
     return INT2NUM(llama_model_n_ctx(ptr->model));
   }
 
+  static VALUE _llama_model_get_model_n_ctx_train(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_model_n_ctx_train(ptr->model));
+  }
+
   static VALUE _llama_model_get_model_n_embd(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_model_n_embd(ptr->model));
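The getter above exposes llama.cpp's llama_model_n_ctx_train, i.e. the context length the model was trained with, as distinct from the context length configured for the current session. A sketch, with the Model constructor keywords assumed rather than taken from this diff:

    model = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ContextParams.new)
    model.n_ctx       # context size configured for this session
    model.n_ctx_train # context size the model was trained with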
@@ -1026,7 +1046,7 @@ private:
 
     llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);
 
     if (n_tokens < 0) {
       rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
@@ -1341,6 +1361,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContext, "n_ctx_train", RUBY_METHOD_FUNC(_llama_context_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
@@ -1564,7 +1585,7 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-    const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
+    const int n = llama_tokenize(ptr->ctx, text.c_str(), text.size(), tokens.data(), n_max_tokens, add_bos);
     if (n < 0) {
       rb_raise(rb_eRuntimeError, "Failed to tokenize");
       return Qnil;
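Upstream llama_tokenize now takes the text length explicitly (the text.size() argument above); the Ruby-level tokenize call is unchanged by this. A sketch reusing the context from the earlier example, with the keyword names (text:, n_max_tokens:, add_bos:) assumed rather than taken from this diff:

    tokens = context.tokenize(text: 'Hello, World.', n_max_tokens: 16, add_bos: true)
    p tokens # => an Array of Integer token ids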
@@ -1733,6 +1754,15 @@ private:
     return INT2NUM(llama_n_ctx(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_ctx_train(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_n_ctx_train(ptr->ctx));
+  }
+
   static VALUE _llama_context_n_embd(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -1,8 +1,3 @@
-// defines MAP_ANONYMOUS
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "ggml-alloc.h"
 #include "ggml.h"
 #include <assert.h>
@@ -136,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
     return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
 }
 
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
@@ -343,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {
 
 // allocate uncommitted virtual memory to measure the size of the graph
 static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    //
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<
+    // 128GB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
     do {
         *base_addr = alloc_vmem(*size);
         if (*base_addr != NULL) {
@@ -404,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 
 //////////// compute graph allocator
 
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
         return false;