llama_cpp 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
- data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
+ metadata.gz: c45589a61587acfbe88add77ffb135a7949619ba2936178c59126c24c30e23cc
+ data.tar.gz: 5866b5b5f8dab59432cc91beca290f927a8d1bc694f83c8ccbe366c6f636f47c
  SHA512:
- metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
- data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
+ metadata.gz: 89714a2a920172c1ddc4fff56a11390ad97db62eb6bd4eefe3ba9376132bd6646eda7569acea49ff3a1ce87486cac0e623cac4fddfcb2b57629c30ee3457d38b
+ data.tar.gz: 2c5494528f55b86c57fccb18658350058210e85f31e9ecb8b3587a4da68a1465a0987a98e86dbae16fa5fea9a9502aed96afec3679e5553a82805f4436ef3020
data/CHANGELOG.md CHANGED
@@ -1,11 +1,23 @@
+ ## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
+
+ - Bump bundled llama.cpp from b1225 to b1266.
+
+ ## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16
+
+ - Bump bundled llama.cpp from b1198 to b1225.
+ - Add `n_ctx_train` method to Model and Context.
+ - Add nvcc option to avoid link error ([#8](https://github.com/yoshoku/llama_cpp.rb/pull/8)).
+ - Set encoding on output of `generate` module function to avoid encoding error ([#9](https://github.com/yoshoku/llama_cpp.rb/pull/9)).
+ - Add `only_copy` option to ModelQuantizeParams.
+
  ## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08

- - Bump bundled llama.cpp from master-b1140 to master-b1198.
+ - Bump bundled llama.cpp from b1140 to b1198.

  ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02

  **Breaking Changes**
- - Bump bundled llama.cpp from master-b1060 to master-b1140.
+ - Bump bundled llama.cpp from b1060 to b1140.
  - Rename `token_to_str` method on Context to `token_to_piece` method.
  - Rename `token_to_str` method on Model to `token_to_piece` method.
  - Rename `type` method on Model to `desc` method.
@@ -14,7 +26,7 @@
  ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26

  **Breaking Changes**
- - Bump bundled llama.cpp from master-097e121 to master-b1060.
+ - Bump bundled llama.cpp from master-097e121 to b1060.
  - Support new file format GGUF.
  - You should re-convert / re-quantize your model files.
  - Remove vocab methods.
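The 0.5.2 and 0.5.3 entries above add two small pieces of public API (`n_ctx_train` on Model and Context, `only_copy` on ModelQuantizeParams) and set the encoding on the output of the `generate` module function. Below is a minimal usage sketch of the context-size readers and `generate`; the model path and the `Model.new` / `Context.new` keyword arguments are assumptions based on the gem's usual README example, not something shown in this diff.

```ruby
require 'llama_cpp'

# Hypothetical model path; constructor keywords assumed from the gem's README.
params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

# n_ctx is the runtime context size configured via ContextParams;
# n_ctx_train (new in 0.5.2) is the context length the model was trained with.
puts "runtime n_ctx:  #{context.n_ctx}"
puts "training n_ctx: #{context.n_ctx_train}" # also available as model.n_ctx_train

# Since 0.5.2 the result of generate has its encoding set, so further string
# operations on it no longer raise encoding errors.
output = LLaMACpp.generate(context, 'Hello, World.')
puts output.encoding
```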
@@ -1,5 +1,5 @@
  UserがTaroという名前のアシスタントと対話するダイアログのトランスクリプト。
- Taroは親切で、親切で、正直で、文章を書くのが上手で、ユーザーのリクエストに即座に正確に答えることを怠りません。
+ Taroは親切で、正直で、文章を書くのが上手で、ユーザーのリクエストに即座に正確に答えることを怠りません。

  User: こんにちには、Taro。
  Taro: こんにちは、今日はどのような要件ですか?
@@ -112,7 +112,7 @@ create_makefile('llama_cpp/llama_cpp')
  if with_config('cublas')
  File.open('Makefile', 'a') do |f|
  f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
- f.puts "\tnvcc -arch=native -c -o $@ $<"
+ f.puts "\tnvcc -shared -Xcompiler -fPIC -arch=native -c -o $@ $<"
  end
  end

@@ -692,6 +692,8 @@ public:
  rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_only_copy), 1);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
  }

  private:
@@ -752,6 +754,18 @@ private:
  LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
  return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
  }
+
+ // only_copy
+ static VALUE _llama_model_quantize_params_set_only_copy(VALUE self, VALUE only_copy) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ ptr->params.only_copy = RTEST(only_copy) ? true : false;
+ return ptr->params.only_copy ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_quantize_params_get_only_copy(VALUE self) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ return ptr->params.only_copy ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
@@ -810,6 +824,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+ rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
  rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
@@ -971,6 +986,11 @@ private:
  return INT2NUM(llama_model_n_ctx(ptr->model));
  }

+ static VALUE _llama_model_get_model_n_ctx_train(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_model_n_ctx_train(ptr->model));
+ }
+
  static VALUE _llama_model_get_model_n_embd(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return INT2NUM(llama_model_n_embd(ptr->model));
@@ -1026,7 +1046,7 @@ private:

  llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
  LLaMAModelWrapper* ptr = get_llama_model(self);
- const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+ const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);

  if (n_tokens < 0) {
  rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
@@ -1341,6 +1361,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+ rb_define_method(rb_cLLaMAContext, "n_ctx_train", RUBY_METHOD_FUNC(_llama_context_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
@@ -1564,7 +1585,7 @@ private:
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
- const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
+ const int n = llama_tokenize(ptr->ctx, text.c_str(), text.size(), tokens.data(), n_max_tokens, add_bos);
  if (n < 0) {
  rb_raise(rb_eRuntimeError, "Failed to tokenize");
  return Qnil;
@@ -1733,6 +1754,15 @@ private:
  return INT2NUM(llama_n_ctx(ptr->ctx));
  }

+ static VALUE _llama_context_n_ctx_train(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_n_ctx_train(ptr->ctx));
+ }
+
  static VALUE _llama_context_n_embd(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
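The extension hunks above also expose llama.cpp's `only_copy` quantize flag as `only_copy=` / `only_copy` on ModelQuantizeParams; when it is set, tensors are copied into the output file without being re-quantized. A short hedged sketch of how that option might be used follows; the `model_quantize` call and its keyword names are assumptions about the gem's existing quantization entry point, and the file paths are hypothetical.

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.only_copy = true   # new in 0.5.2: copy tensors as-is instead of re-quantizing
puts params.only_copy     # => true

# Assumed entry point and keyword names; paths are placeholders.
LLaMACpp.model_quantize(input_path: 'in.gguf', output_path: 'out.gguf', params: params)
```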
@@ -1,8 +1,3 @@
- // defines MAP_ANONYMOUS
- #ifndef _GNU_SOURCE
- #define _GNU_SOURCE
- #endif
-
  #include "ggml-alloc.h"
  #include "ggml.h"
  #include <assert.h>
@@ -136,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
  return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
  }

+ static bool ggml_is_view(struct ggml_tensor * t) {
+ return t->view_src != NULL;
+ }
+
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  #ifdef GGML_ALLOCATOR_DEBUG
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
@@ -343,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {

  // allocate uncommitted virtual memory to measure the size of the graph
  static void alloc_measure_vmem(void ** base_addr, size_t * size) {
- // 1TB for 64-bit, 1GB for 32-bit
- *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+ // 128GB for 64-bit, 1GB for 32-bit
+ *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
  do {
  *base_addr = alloc_vmem(*size);
  if (*base_addr != NULL) {
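As a quick arithmetic check on the new measurement arena size above (not part of the package): `1ULL<<37` bytes is 128 GiB, matching the updated comment, versus the previous `1ULL<<40` (1 TiB).

```ruby
# 2**37 bytes expressed in GiB.
puts (1 << 37) / (1024**3) # => 128
```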
@@ -404,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {

  //////////// compute graph allocator

- static bool ggml_is_view(struct ggml_tensor * t) {
- return t->view_src != NULL;
- }
-
  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
  if (a->type != b->type) {
  return false;