llama_cpp 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/README.md +5 -4
- data/ext/llama_cpp/extconf.rb +38 -0
- data/ext/llama_cpp/llama_cpp.cpp +118 -2
- data/ext/llama_cpp/src/ggml.c +1740 -658
- data/ext/llama_cpp/src/ggml.h +84 -16
- data/ext/llama_cpp/src/llama.cpp +1108 -756
- data/ext/llama_cpp/src/llama.h +37 -1
- data/ext/llama_cpp/src/llama_util.h +396 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +6 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2df0c858faac117b7317683fb7b9a52fc0eb4f7329f728ac6a209085af487142
+  data.tar.gz: 6b5c5d5d5d4e9020b92c7d76c12086fd77089ecc9c2181fb9d8157df5267da96
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8e9d3ccdb8cdc9d4cb7b60f32a709c874953c357fdaccc057502e5761efdec62a0fc0b39929448203ffc4210dbf0ca2f6019dc13f88cf0db84b754f44fd77bea
+  data.tar.gz: 75fc1d6674c8d509ae0557308277d6d3d7e05f5a6fbea512c2472c46bea1de6e2541a67ec3dda43f874d7f64e6981b720aa1c722d3ec7ea3b96ae9084a4d201b
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,41 @@
 ## [Unreleased]

+## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
+
+- Bump bundled llama.cpp from master-c85e03d to master-315a95a.
+- Add `apply_lora_from_file` method to LLaMACpp::Context.
+- Add `mlock_supported?` module function to LLaMACpp.
+- Add `mmap_supported?` module function to LLaMACpp.
+- Fix to not destroy original prompt in `LLaMACpp.generate` module function.
+- Add check for context initialization.
+- Add blas config options:
+```
+$ gem install llama_cpp -- --with-openblas
+```
+macOS:
+```
+$ gem install llama_cpp -- --with-openblas --with-opt-dir=/opt/homebrew/opt/openblas
+$ gem install llama_cpp -- --with-accelerate
+```
+
+## [[0.0.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.3...v0.0.4)] - 2023-04-15
+
+- Bump bundled llama.cpp from master-698f7b5 to master-c85e03d.
+- Add parameterless constructor to LLaMACpp::Context.
+- Add free and load methods to LLaMACpp::Context.
+```ruby
+require 'llama_cpp'
+
+context = LLaMACpp::Context.new
+
+params = LLaMACpp::ContextParams.new
+context.load(model_path: '/path/to/ggml-model-q4_0.bin', params: params)
+
+# ...
+
+context.free
+```
+
 ## [[0.0.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.2...v0.0.3)] - 2023-04-08

 - Bump bundled llama.cpp from master-5b70e7d to master-698f7b5.
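As a quick orientation for the 0.0.5 entries above: the sketch below is not taken from the gem's documentation; it shows how the two new feature probes can be called, and the commented `apply_lora_from_file` call simply restates the keyword arguments visible in the binding code further down (`lora_path` is required, `base_model_path` and `n_threads` are optional), with placeholder paths.

```ruby
require 'llama_cpp'

# Feature probes added in 0.0.5; each returns true or false depending on
# how the bundled llama.cpp was compiled.
puts LLaMACpp.mmap_supported?
puts LLaMACpp.mlock_supported?

# The new LoRA binding is a Context instance method with keyword arguments
# (names taken from the kw_table in llama_cpp.cpp below); paths are placeholders:
#   context.apply_lora_from_file(lora_path: '/path/to/lora-adapter.bin',
#                                base_model_path: '/path/to/base-model.bin',
#                                n_threads: 4)
```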
data/README.md
CHANGED
@@ -20,17 +20,18 @@ If bundler is not being used to manage dependencies, install the gem by executing

 ## Usage

-Prepare
+Prepare the quantized model by referring to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage) or
+download the quantized model, for example [ggml-vicuna-7b-4bit](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5541351), from Hugging Face.

 ```ruby
 require 'llama_cpp'

 params = LLaMACpp::ContextParams.new
-params.seed =
+params.seed = 12

-context = LLaMACpp::Context.new(model_path: '/path/to/
+context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

-puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.')
+puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)
 # => "There are two major cities in Japan, Tokyo and Osaka, which have about 30 million populations."
 ```

data/ext/llama_cpp/extconf.rb
CHANGED
@@ -10,4 +10,42 @@ $CXXFLAGS << ' -std=c++11'
 $INCFLAGS << ' -I$(srcdir)/src'
 $VPATH << '$(srcdir)/src'

+if RUBY_PLATFORM.match?(/darwin|linux|bsd/) && try_compile('#include <stdio.h>', '-pthread')
+  $CFLAGS << ' -pthread'
+  $CXXFLAGS << ' -pthread'
+end
+
+if with_config('openblas')
+  abort 'libopenblas is not found.' unless have_library('openblas')
+  abort 'cblas.h is not found.' unless have_header('cblas.h')
+
+  $CFLAGS << ' -DGGML_USE_OPENBLAS'
+end
+
+if with_config('accelerate')
+  $CFLAGS << ' -DGGML_USE_ACCELERATE'
+  $LDFLAGS << ' -framework Accelerate'
+end
+
+UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
+
+# rubocop:disable Layout/LineLength
+if UNAME_M.match?(/x86_64|i686/) && try_compile('#include <stdio.h>', '-march=native -mtune=native')
+  $CFLAGS << ' -march=native -mtune=native'
+  $CXXFLAGS << ' -march=native -mtune=native'
+elsif UNAME_M.match?(/aarch64/) && try_compile('#include <stdio.h>', '-mcpu=native')
+  $CFLAGS << ' -mcpu=native'
+  $CXXFLAGS << ' -mcpu=native'
+elsif UNAME_M.match?(/armv6/) && try_compile('#include <stdio.h>', '-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access')
+  $CFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access'
+  $CXXFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access'
+elsif UNAME_M.match?(/armv7/) && try_compile('#include <stdio.h>', '-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations')
+  $CFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations'
+  $CXXFLAGS << ' -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations'
+elsif UNAME_M.match?(/armv8/) && try_compile('#include <stdio.h>', '-mfp16-format=ieee -mno-unaligned-access')
+  $CFLAGS << ' -mfp16-format=ieee -mno-unaligned-access'
+  $CXXFLAGS << ' -mfp16-format=ieee -mno-unaligned-access'
+end
+# rubocop:enable Layout/LineLength
+
 create_makefile('llama_cpp/llama_cpp')
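The `--with-openblas` and `--with-accelerate` switches consumed by `with_config` above only take effect at build time. A minimal sketch, assuming the gem built successfully, for checking what the compiled extension reports at runtime; the exact contents of the string come from the bundled llama.cpp:

```ruby
require 'llama_cpp'

# Prints the feature string of the bundled llama.cpp build, which reports
# the CPU extensions and BLAS backend the native extension was compiled with.
puts LLaMACpp.print_system_info
```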
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -226,6 +226,9 @@ public:
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
+    rb_define_method(rb_cLLaMAContext, "free", RUBY_METHOD_FUNC(_llama_context_free), 0);
+    rb_define_method(rb_cLLaMAContext, "load", RUBY_METHOD_FUNC(_llama_context_load), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
   };

 private:

@@ -236,7 +239,13 @@ private:
     ID kw_table[2] = { rb_intern("model_path"), rb_intern("params") };
     VALUE kw_values[2] = { Qundef, Qundef };
     rb_scan_args(argc, argv, ":", &kw_args);
-    rb_get_kwargs(kw_args, kw_table,
+    rb_get_kwargs(kw_args, kw_table, 0, 2, kw_values);
+
+    if (kw_values[0] == Qundef && kw_values[1] == Qundef) {
+      rb_iv_set(self, "@params", Qnil);
+      rb_iv_set(self, "@has_evaluated", Qfalse);
+      return Qnil;
+    }

     if (!RB_TYPE_P(kw_values[0], T_STRING)) {
       rb_raise(rb_eArgError, "model_path must be a string");

@@ -260,7 +269,7 @@
     rb_iv_set(self, "@has_evaluated", Qfalse);

     RB_GC_GUARD(filename);
-    return
+    return Qnil;
   };

   static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {

@@ -303,6 +312,10 @@
     const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);

     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
       rb_raise(rb_eRuntimeError, "Failed to evaluate");
       return Qnil;

@@ -341,6 +354,10 @@

     std::vector<llama_token> tokens(n_max_tokens);
     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
     if (n < 0) {
       rb_raise(rb_eRuntimeError, "Failed to tokenize");

@@ -441,6 +458,10 @@
     }

     LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
     llama_token token = llama_sample_top_p_top_k(ptr->ctx, last_n_tokens_data.data(), last_n_tokens_size, top_k, top_p, temp, penalty);

     return INT2NUM(token);

@@ -492,6 +513,91 @@
     llama_reset_timings(ptr->ctx);
     return Qnil;
   };
+
+  static VALUE _llama_context_free(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx != NULL) {
+      llama_free(ptr->ctx);
+      ptr->ctx = NULL;
+      rb_iv_set(self, "@params", Qnil);
+      rb_iv_set(self, "@has_evaluated", Qfalse);
+    }
+    return Qnil;
+  }
+
+  static VALUE _llama_context_load(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[2] = { rb_intern("model_path"), rb_intern("params") };
+    VALUE kw_values[2] = { Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "model_path must be a string");
+      return Qnil;
+    }
+    if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
+      rb_raise(rb_eArgError, "params must be a LLaMAContextParams");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx != NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is already loaded");
+      return Qnil;
+    }
+
+    VALUE filename = kw_values[0];
+    LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(kw_values[1]);
+    ctx_ptr->ctx = llama_init_from_file(StringValueCStr(filename), prms_ptr->params);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "Failed to initialize LLaMA context");
+      return Qnil;
+    }
+
+    rb_iv_set(self, "@params", kw_values[1]);
+    rb_iv_set(self, "@has_evaluated", Qfalse);
+
+    RB_GC_GUARD(filename);
+    return Qnil;
+  };
+
+  static VALUE _llama_context_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "lora_path must be a string");
+      return Qnil;
+    }
+    if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
+      rb_raise(rb_eArgError, "base_model_path must be a string");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+
+    const char* lora_path = StringValueCStr(kw_values[0]);
+    const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
+    const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx != NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is already loaded");
+      return Qnil;
+    }
+
+    if (llama_apply_lora_from_file(ptr->ctx, lora_path, base_model_path, n_threads) != 0) {
+      rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
+      return Qnil;
+    }
+    return Qnil;
+  };
 };

 const rb_data_type_t RbLLaMAContext::llama_context_type = {

@@ -519,6 +625,14 @@ static VALUE rb_llama_print_system_info(VALUE self) {
   return rb_utf8_str_new_cstr(result);
 }

+static VALUE rb_llama_mmap_supported(VALUE self) {
+  return llama_mmap_supported() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_mlock_supported(VALUE self) {
+  return llama_mlock_supported() ? Qtrue : Qfalse;
+}
+
 extern "C" void Init_llama_cpp(void) {
   rb_mLLaMACpp = rb_define_module("LLaMACpp");
   RbLLaMAContext::define_class(rb_mLLaMACpp);

@@ -527,6 +641,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
   rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
+  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);

   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   std::stringstream ss_magic;
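Taken together, the NULL-context guards and the new `free`/`load` bindings above give the Ruby API an explicit lifecycle. A minimal sketch of the two-step construction, using a placeholder model path (the one-step `Context.new(model_path: ..., params: ...)` form from the README still applies):

```ruby
require 'llama_cpp'

# 0.0.4 added the parameterless constructor plus load/free; the guards added
# in 0.0.5 raise a RuntimeError ("LLaMA context is not initialized") if
# evaluation, tokenization, or sampling is attempted before a model is loaded.
context = LLaMACpp::Context.new

params = LLaMACpp::ContextParams.new
context.load(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

puts LLaMACpp.generate(context, 'Please tell me the largest city in Japan.', n_threads: 4)

context.free
```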