llama_cpp 0.5.1 → 0.5.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +32 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -11
- data/ext/llama_cpp/src/ggml-cuda.cu +1108 -699
- data/ext/llama_cpp/src/ggml-metal.m +93 -24
- data/ext/llama_cpp/src/ggml-metal.metal +407 -174
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +75 -43
- data/ext/llama_cpp/src/ggml.h +42 -32
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +1040 -201
- data/ext/llama_cpp/src/llama.h +13 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c45589a61587acfbe88add77ffb135a7949619ba2936178c59126c24c30e23cc
+  data.tar.gz: 5866b5b5f8dab59432cc91beca290f927a8d1bc694f83c8ccbe366c6f636f47c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 89714a2a920172c1ddc4fff56a11390ad97db62eb6bd4eefe3ba9376132bd6646eda7569acea49ff3a1ce87486cac0e623cac4fddfcb2b57629c30ee3457d38b
+  data.tar.gz: 2c5494528f55b86c57fccb18658350058210e85f31e9ecb8b3587a4da68a1465a0987a98e86dbae16fa5fea9a9502aed96afec3679e5553a82805f4436ef3020
data/CHANGELOG.md
CHANGED
@@ -1,11 +1,23 @@
+## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
+
+- Bump bundled llama.cpp from b1 to b1266.
+
+## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16
+
+- Bump bundled llama.cpp from b1198 to b1.
+- Add `n_ctx_train` method to Model and Context.
+- Add nvcc option to avoid link error ([#8](https://github.com/yoshoku/llama_cpp.rb/pull/8)).
+- Set encoding on output of `generate` module function to avoid encoding error ([#9](https://github.com/yoshoku/llama_cpp.rb/pull/9)).
+- Add `only_copy` option to ModelQuantizeParams.
+
 ## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
 
-- Bump bundled llama.cpp from
+- Bump bundled llama.cpp from b1140 to b1198.
 
 ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
 
 **Breaking Changes**
-- Bump bundled llama.cpp from
+- Bump bundled llama.cpp from b1060 to b1140.
 - Rename `token_to_str` method on Context to `token_to_piece` method.
 - Rename `token_to_str` method on Model to `token_to_piece` method.
 - Rename `type` method on Model to `desc` method.
@@ -14,7 +26,7 @@
 ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
 **Breaking Changes**
-- Bump bundled llama.cpp from master-097e121 to
+- Bump bundled llama.cpp from master-097e121 to b1060.
 - Support new file format GGUF.
   - You should re-convert / re-quantize your model files.
 - Remove vocab methods.
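The 0.5.2 entries above add user-facing API: `n_ctx_train` on Model and Context, the `only_copy` quantize option, and an encoding tag on `generate` output. A minimal sketch of the first and last of these; the constructor and `generate` keyword arguments follow the gem's README-style example, and the model path is a placeholder:

```ruby
require 'llama_cpp'

# Placeholder model path; constructor keywords follow the gem's README example.
params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

# n_ctx_train (added in 0.5.2) reports the context length the model was trained with,
# in contrast to n_ctx, which is the context length configured for this session.
puts model.n_ctx_train
puts context.n_ctx_train

# The generate module function now tags its output string with an encoding (#9),
# so printing non-ASCII completions no longer raises an encoding error.
puts LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)
```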
data/examples/prompt_jp.txt
CHANGED
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -112,7 +112,7 @@ create_makefile('llama_cpp/llama_cpp')
 if with_config('cublas')
   File.open('Makefile', 'a') do |f|
     f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
-    f.puts "\tnvcc -arch=native -c -o $@ $<"
+    f.puts "\tnvcc -shared -Xcompiler -fPIC -arch=native -c -o $@ $<"
  end
 end
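This appended nvcc rule is the "nvcc option to avoid link error" from the CHANGELOG ([#8]). The most likely mechanism: object files that get linked into the Ruby extension's shared library must be position-independent, so `-Xcompiler -fPIC` is passed through nvcc when ggml-cuda.cu is compiled; without it, the final link of the llama_cpp shared object can fail with relocation errors on Linux.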
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -692,6 +692,8 @@ public:
     rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_only_copy), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
   }
 
 private:
@@ -752,6 +754,18 @@ private:
     LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
     return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
   }
+
+  // only_copy
+  static VALUE _llama_model_quantize_params_set_only_copy(VALUE self, VALUE only_copy) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.only_copy = RTEST(only_copy) ? true : false;
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_quantize_params_get_only_copy(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.only_copy ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
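The accessors above expose llama.cpp's `only_copy` quantization flag to Ruby. A hedged usage sketch; the `model_quantize` module function and its keyword names are assumed from the gem's existing API, and both paths are placeholders:

```ruby
require 'llama_cpp'

# With only_copy = true, tensors are copied as-is instead of being re-quantized,
# which is handy when you only want to rewrite a file into the current GGUF layout.
params = LLaMACpp::ModelQuantizeParams.new
params.only_copy = true

# Hypothetical paths; keyword names assumed from the gem's model_quantize signature.
LLaMACpp.model_quantize(input_path: 'input.gguf', output_path: 'output.gguf', params: params)
```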
@@ -810,6 +824,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+    rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
@@ -971,6 +986,11 @@ private:
     return INT2NUM(llama_model_n_ctx(ptr->model));
   }
 
+  static VALUE _llama_model_get_model_n_ctx_train(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_model_n_ctx_train(ptr->model));
+  }
+
   static VALUE _llama_model_get_model_n_embd(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_model_n_embd(ptr->model));
@@ -1026,7 +1046,7 @@ private:
 
     llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);
 
     if (n_tokens < 0) {
       rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
@@ -1341,6 +1361,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContext, "n_ctx_train", RUBY_METHOD_FUNC(_llama_context_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
@@ -1564,7 +1585,7 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-    const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
+    const int n = llama_tokenize(ptr->ctx, text.c_str(), text.size(), tokens.data(), n_max_tokens, add_bos);
     if (n < 0) {
       rb_raise(rb_eRuntimeError, "Failed to tokenize");
       return Qnil;
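Both tokenize call sites (Model above and Context here) change because the bundled llama.cpp now takes an explicit byte length for the input text in `llama_tokenize` / `llama_tokenize_with_model`; the wrapper simply passes `text.size()`, so nothing changes at the Ruby level. A sketch, with the keyword names assumed from the wrapper methods in this diff and a placeholder model path:

```ruby
require 'llama_cpp'

# Placeholder path; keyword names (text:, n_max_tokens:, add_bos:) are assumptions
# based on the tokenize wrappers shown in this diff.
params = LLaMACpp::ContextParams.new
model  = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: params)
tokens = model.tokenize(text: 'Hello, World.', n_max_tokens: 32, add_bos: true)
p tokens # => an Array of Integer token ids
```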
@@ -1733,6 +1754,15 @@ private:
     return INT2NUM(llama_n_ctx(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_ctx_train(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(llama_n_ctx_train(ptr->ctx));
+  }
+
   static VALUE _llama_context_n_embd(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -1,8 +1,3 @@
-// defines MAP_ANONYMOUS
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "ggml-alloc.h"
 #include "ggml.h"
 #include <assert.h>
@@ -136,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
     return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
 }
 
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
@@ -343,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {
 
 // allocate uncommitted virtual memory to measure the size of the graph
 static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    //
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<
+    // 128GB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
     do {
         *base_addr = alloc_vmem(*size);
         if (*base_addr != NULL) {
@@ -404,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 
 //////////// compute graph allocator
 
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
         return false;