llama_cpp 0.8.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8045208b5f7801979212a4f6ed395217e78f06bcfbc2d0362aaaa04c529745cd
4
- data.tar.gz: 4011dfe279d8d4041c6c79dc5a6bad199777f83b5f0559f11ccd2f68c957e462
3
+ metadata.gz: 683f2d81aff9e82234925ba08cd5b46b56a2283ff8397a6c06ce50d34a95dbfc
4
+ data.tar.gz: d3005cab273b8d85f47f4cb4314fbab3a540d366a42829e5ec8d2c29576ae09e
5
5
  SHA512:
6
- metadata.gz: d15e74da491773961006eca8ca6c6d80b30ffc995c56a9140961be0002eb09134f1a029c4e8ee192497fb7256fe36cf1c3ed928967ce57ece4c7a0904392c8fe
7
- data.tar.gz: a863596304ddb9ac5e4be2b2b65bebc7d3913705b8a0f516debfee0ca213f9dca69707edda8d70cfafb15500fcb6e70cffb6d5d1119302d24e05059c50f0da77
6
+ metadata.gz: 559f1ba1253a704c38480336decd315c65b4d80e6895ad1dc0faa3b5b81570a1faeaadcb6ec7ee3145f0fff758ab5e38e6cb8163382ce9b693d893deebe9a8f9
7
+ data.tar.gz: cb3d96b8c3f79cd20d4169a175270e8768c04bcaa24e51cb2c4d7872db88bc6e3349e6b1e93a130b89d21daab8be6e57b5305412059ea722084c7cb7d4a01e93
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
1
+ ## [[0.9.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.8.0...v0.9.0)] - 2023-10-28
2
+
3
+ - Fix missing object file for ggml-backend when building with metal and cublas options.
4
+
5
+ **Breaking Changes**
6
+ - Bump bundled llama.cpp from b1405 to b1429
7
+ - Move the following methods from Context to Model:
8
+ - text, score, type, token_bos, token_eos, token_nl, token_prefix, token_middle, token_suffix, and token_eot.
9
+ - Add `sample_repetition_penalties` method, which integrates sample_frequency_and_presence_penalties and sample_repetition_penalty methods.
10
+
1
11
  ## [[0.8.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.1...v0.8.0)] - 2023-10-21
2
12
 
3
13
  **Breaking Changes**
data/examples/chat.rb CHANGED
@@ -83,10 +83,12 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
83
83
  candidates = LLaMACpp::TokenDataArray.new(base_candidates)
84
84
 
85
85
  last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
86
- context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: options[:repeat_penalty])
87
- context.sample_frequency_and_presence_penalties(
88
- candidates, last_n_tokens[-last_n_repeat..],
89
- frequency: options[:frequency_penalty], presence: options[:presence_penalty]
86
+ context.sample_repetition_penalties(
87
+ candidates,
88
+ last_n_tokens[-last_n_repeat..],
89
+ penalty_repeat: options[:repeat_penalty],
90
+ penalty_freq: options[:frequency_penalty],
91
+ penalty_present: options[:presence_penalty]
90
92
  )
91
93
 
92
94
  context.sample_top_k(candidates, k: options[:top_k])
@@ -99,8 +101,8 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
99
101
  last_n_tokens.shift
100
102
  last_n_tokens.push(id)
101
103
 
102
- if id == context.token_eos
103
- id = context.token_nl
104
+ if id == context.model.token_eos
105
+ id = context.model.token_nl
104
106
  unless antiprompt.empty?
105
107
  first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
106
108
  embd_input.concat(first_antiprompt)
@@ -53,7 +53,7 @@ if with_config('metal')
53
53
  $CFLAGS << ' -DGGML_USE_METAL'
54
54
  $CXXFLAGS << ' -DGGML_USE_METAL'
55
55
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
56
- $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
56
+ $objs = %w[ggml.o ggml-backend.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
57
57
  $objs << 'k_quants.o' unless with_config('no_k_quants')
58
58
  end
59
59
 
@@ -61,7 +61,7 @@ if with_config('cublas')
61
61
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
62
62
  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
63
63
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
64
- $objs = %w[ggml.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
64
+ $objs = %w[ggml.o ggml-backend.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
65
65
  $objs << 'k_quants.o' unless with_config('no_k_quants')
66
66
  end
67
67
 
@@ -1148,6 +1148,16 @@ public:
1148
1148
  rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
1149
1149
  rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
1150
1150
  rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
1151
+ rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
1152
+ rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
1153
+ rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
1154
+ rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
1155
+ rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
1156
+ rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
1157
+ rb_define_method(rb_cLLaMAModel, "token_prefix", RUBY_METHOD_FUNC(_llama_model_token_prefix), 0);
1158
+ rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
1159
+ rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
1160
+ rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
1151
1161
  }
1152
1162
 
1153
1163
  private:
@@ -1396,6 +1406,62 @@ private:
1396
1406
  LLaMAModelWrapper* ptr = get_llama_model(self);
1397
1407
  return UINT2NUM(llama_model_n_params(ptr->model));
1398
1408
  }
1409
+
1410
+ static VALUE _llama_model_get_text(VALUE self, VALUE token_) {
1411
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1412
+ const llama_token token = NUM2INT(token_);
1413
+ const char* text = llama_token_get_text(ptr->model, token);
1414
+ return rb_utf8_str_new_cstr(text);
1415
+ }
1416
+
1417
+ static VALUE _llama_model_get_score(VALUE self, VALUE token_) {
1418
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1419
+ const llama_token token = NUM2INT(token_);
1420
+ const float score = llama_token_get_score(ptr->model, token);
1421
+ return DBL2NUM(score);
1422
+ }
1423
+
1424
+ static VALUE _llama_model_get_type(VALUE self, VALUE token_) {
1425
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1426
+ const llama_token token = NUM2INT(token_);
1427
+ const int type = llama_token_get_type(ptr->model, token);
1428
+ return INT2NUM(type);
1429
+ }
1430
+
1431
+ static VALUE _llama_model_token_bos(VALUE self) {
1432
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1433
+ return INT2NUM(llama_token_bos(ptr->model));
1434
+ }
1435
+
1436
+ static VALUE _llama_model_token_eos(VALUE self) {
1437
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1438
+ return INT2NUM(llama_token_eos(ptr->model));
1439
+ }
1440
+
1441
+ static VALUE _llama_model_token_nl(VALUE self) {
1442
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1443
+ return INT2NUM(llama_token_nl(ptr->model));
1444
+ }
1445
+
1446
+ static VALUE _llama_model_token_prefix(VALUE self) {
1447
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1448
+ return INT2NUM(llama_token_prefix(ptr->model));
1449
+ }
1450
+
1451
+ static VALUE _llama_model_token_middle(VALUE self) {
1452
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1453
+ return INT2NUM(llama_token_middle(ptr->model));
1454
+ }
1455
+
1456
+ static VALUE _llama_model_token_suffix(VALUE self) {
1457
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1458
+ return INT2NUM(llama_token_suffix(ptr->model));
1459
+ }
1460
+
1461
+ static VALUE _llama_model_token_eot(VALUE self) {
1462
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1463
+ return INT2NUM(llama_token_eot(ptr->model));
1464
+ }
1399
1465
  };
1400
1466
 
1401
1467
  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -1670,16 +1736,6 @@ public:
1670
1736
  rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
1671
1737
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
1672
1738
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
1673
- rb_define_method(rb_cLLaMAContext, "text", RUBY_METHOD_FUNC(_llama_context_text), 1);
1674
- rb_define_method(rb_cLLaMAContext, "score", RUBY_METHOD_FUNC(_llama_context_score), 1);
1675
- rb_define_method(rb_cLLaMAContext, "type", RUBY_METHOD_FUNC(_llama_context_type), 1);
1676
- rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
1677
- rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
1678
- rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
1679
- rb_define_method(rb_cLLaMAContext, "token_prefix", RUBY_METHOD_FUNC(_llama_context_token_prefix), 0);
1680
- rb_define_method(rb_cLLaMAContext, "token_middle", RUBY_METHOD_FUNC(_llama_context_token_middle), 0);
1681
- rb_define_method(rb_cLLaMAContext, "token_suffix", RUBY_METHOD_FUNC(_llama_context_token_suffix), 0);
1682
- rb_define_method(rb_cLLaMAContext, "token_eot", RUBY_METHOD_FUNC(_llama_context_token_eot), 0);
1683
1739
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
1684
1740
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
1685
1741
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
@@ -1693,8 +1749,7 @@ public:
1693
1749
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
1694
1750
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
1695
1751
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
1696
- rb_define_method(rb_cLLaMAContext, "sample_repetition_penalty", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalty), -1);
1697
- rb_define_method(rb_cLLaMAContext, "sample_frequency_and_presence_penalties", RUBY_METHOD_FUNC(_llama_context_sample_frequency_and_presence_penalties), -1);
1752
+ rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
1698
1753
  rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
1699
1754
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
1700
1755
  rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
@@ -1927,102 +1982,6 @@ private:
1927
1982
  return output;
1928
1983
  }
1929
1984
 
1930
- static VALUE _llama_context_text(VALUE self, VALUE token_) {
1931
- LLaMAContextWrapper* ptr = get_llama_context(self);
1932
- if (ptr->ctx == NULL) {
1933
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1934
- return Qnil;
1935
- }
1936
- const llama_token token = NUM2INT(token_);
1937
- const char* text = llama_token_get_text(ptr->ctx, token);
1938
- return rb_utf8_str_new_cstr(text);
1939
- }
1940
-
1941
- static VALUE _llama_context_score(VALUE self, VALUE token_) {
1942
- LLaMAContextWrapper* ptr = get_llama_context(self);
1943
- if (ptr->ctx == NULL) {
1944
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1945
- return Qnil;
1946
- }
1947
- const llama_token token = NUM2INT(token_);
1948
- const float score = llama_token_get_score(ptr->ctx, token);
1949
- return DBL2NUM(score);
1950
- }
1951
-
1952
- static VALUE _llama_context_type(VALUE self, VALUE token_) {
1953
- LLaMAContextWrapper* ptr = get_llama_context(self);
1954
- if (ptr->ctx == NULL) {
1955
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1956
- return Qnil;
1957
- }
1958
- const llama_token token = NUM2INT(token_);
1959
- const int type = llama_token_get_type(ptr->ctx, token);
1960
- return INT2NUM(type);
1961
- }
1962
-
1963
- static VALUE _llama_context_token_bos(VALUE self) {
1964
- LLaMAContextWrapper* ptr = get_llama_context(self);
1965
- if (ptr->ctx == NULL) {
1966
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1967
- return Qnil;
1968
- }
1969
- return INT2NUM(llama_token_bos(ptr->ctx));
1970
- }
1971
-
1972
- static VALUE _llama_context_token_eos(VALUE self) {
1973
- LLaMAContextWrapper* ptr = get_llama_context(self);
1974
- if (ptr->ctx == NULL) {
1975
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1976
- return Qnil;
1977
- }
1978
- return INT2NUM(llama_token_eos(ptr->ctx));
1979
- }
1980
-
1981
- static VALUE _llama_context_token_nl(VALUE self) {
1982
- LLaMAContextWrapper* ptr = get_llama_context(self);
1983
- if (ptr->ctx == NULL) {
1984
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1985
- return Qnil;
1986
- }
1987
- return INT2NUM(llama_token_nl(ptr->ctx));
1988
- }
1989
-
1990
- static VALUE _llama_context_token_prefix(VALUE self) {
1991
- LLaMAContextWrapper* ptr = get_llama_context(self);
1992
- if (ptr->ctx == NULL) {
1993
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1994
- return Qnil;
1995
- }
1996
- return INT2NUM(llama_token_prefix(ptr->ctx));
1997
- }
1998
-
1999
- static VALUE _llama_context_token_middle(VALUE self) {
2000
- LLaMAContextWrapper* ptr = get_llama_context(self);
2001
- if (ptr->ctx == NULL) {
2002
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2003
- return Qnil;
2004
- }
2005
- return INT2NUM(llama_token_middle(ptr->ctx));
2006
- }
2007
-
2008
- static VALUE _llama_context_token_suffix(VALUE self) {
2009
- LLaMAContextWrapper* ptr = get_llama_context(self);
2010
- if (ptr->ctx == NULL) {
2011
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2012
- return Qnil;
2013
- }
2014
- return INT2NUM(llama_token_suffix(ptr->ctx));
2015
- }
2016
-
2017
- static VALUE _llama_context_token_eot(VALUE self) {
2018
- LLaMAContextWrapper* ptr = get_llama_context(self);
2019
- if (ptr->ctx == NULL) {
2020
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2021
- return Qnil;
2022
- }
2023
- return INT2NUM(llama_token_eot(ptr->ctx));
2024
- }
2025
-
2026
1985
  static VALUE _llama_context_n_ctx(VALUE self) {
2027
1986
  LLaMAContextWrapper* ptr = get_llama_context(self);
2028
1987
  if (ptr->ctx == NULL) {
@@ -2231,14 +2190,14 @@ private:
2231
2190
  return Qnil;
2232
2191
  }
2233
2192
 
2234
- static VALUE _llama_context_sample_repetition_penalty(int argc, VALUE* argv, VALUE self) {
2193
+ static VALUE _llama_context_sample_repetition_penalties(int argc, VALUE* argv, VALUE self) {
2235
2194
  VALUE kw_args = Qnil;
2236
- ID kw_table[1] = { rb_intern("penalty") };
2237
- VALUE kw_values[1] = { Qundef };
2195
+ ID kw_table[3] = { rb_intern("penalty_repeat"), rb_intern("penalty_freq"), rb_intern("penalty_present") };
2196
+ VALUE kw_values[3] = { Qundef, Qundef, Qundef };
2238
2197
  VALUE candidates = Qnil;
2239
2198
  VALUE last_n_tokens = Qnil;
2240
2199
  rb_scan_args(argc, argv, "2:", &candidates, &last_n_tokens, &kw_args);
2241
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
2200
+ rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
2242
2201
 
2243
2202
  if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
2244
2203
  rb_raise(rb_eArgError, "candidates must be a TokenDataArray");
@@ -2249,56 +2208,15 @@ private:
2249
2208
  return Qnil;
2250
2209
  }
2251
2210
  if (!RB_FLOAT_TYPE_P(kw_values[0])) {
2252
- rb_raise(rb_eArgError, "penalty must be a float");
2211
+ rb_raise(rb_eArgError, "penalty_repeat must be a float");
2253
2212
  return Qnil;
2254
2213
  }
2255
-
2256
- const size_t last_tokens_size = RARRAY_LEN(last_n_tokens);
2257
- std::vector<llama_token> last_n_tokens_data(last_tokens_size);
2258
- for (size_t i = 0; i < last_tokens_size; i++) {
2259
- last_n_tokens_data[i] = NUM2INT(rb_ary_entry(last_n_tokens, i));
2260
- }
2261
-
2262
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
2263
- if (ctx_ptr->ctx == NULL) {
2264
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2265
- return Qnil;
2266
- }
2267
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
2268
- if (cnd_ptr->array.data == nullptr) {
2269
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
2270
- return Qnil;
2271
- }
2272
- const float penalty = NUM2DBL(kw_values[0]);
2273
-
2274
- llama_sample_repetition_penalty(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size, penalty);
2275
-
2276
- return Qnil;
2277
- }
2278
-
2279
- static VALUE _llama_context_sample_frequency_and_presence_penalties(int argc, VALUE* argv, VALUE self) {
2280
- VALUE kw_args = Qnil;
2281
- ID kw_table[2] = { rb_intern("frequency"), rb_intern("presence") };
2282
- VALUE kw_values[2] = { Qundef, Qundef };
2283
- VALUE candidates = Qnil;
2284
- VALUE last_n_tokens = Qnil;
2285
- rb_scan_args(argc, argv, "2:", &candidates, &last_n_tokens, &kw_args);
2286
- rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
2287
-
2288
- if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
2289
- rb_raise(rb_eArgError, "candidates must be a TokenDataArray");
2290
- return Qnil;
2291
- }
2292
- if (!RB_TYPE_P(last_n_tokens, T_ARRAY)) {
2293
- rb_raise(rb_eArgError, "last_n_tokens must be an Array");
2294
- return Qnil;
2295
- }
2296
- if (!RB_FLOAT_TYPE_P(kw_values[0])) {
2297
- rb_raise(rb_eArgError, "frequency must be a float");
2214
+ if (!RB_FLOAT_TYPE_P(kw_values[1])) {
2215
+ rb_raise(rb_eArgError, "penalty_freq must be a float");
2298
2216
  return Qnil;
2299
2217
  }
2300
- if (!RB_FLOAT_TYPE_P(kw_values[1])) {
2301
- rb_raise(rb_eArgError, "presence must be a float");
2218
+ if (!RB_FLOAT_TYPE_P(kw_values[2])) {
2219
+ rb_raise(rb_eArgError, "penalty_present must be a float");
2302
2220
  return Qnil;
2303
2221
  }
2304
2222
 
@@ -2318,11 +2236,12 @@ private:
2318
2236
  rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
2319
2237
  return Qnil;
2320
2238
  }
2239
+ const float penalty_repeat = NUM2DBL(kw_values[0]);
2240
+ const float penalty_freq = NUM2DBL(kw_values[1]);
2241
+ const float penalty_present = NUM2DBL(kw_values[2]);
2321
2242
 
2322
- const float alpha_frequency = NUM2DBL(kw_values[0]);
2323
- const float alpha_presence = NUM2DBL(kw_values[1]);
2324
-
2325
- llama_sample_frequency_and_presence_penalties(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size, alpha_frequency, alpha_presence);
2243
+ llama_sample_repetition_penalties(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size,
2244
+ penalty_repeat, penalty_freq, penalty_present);
2326
2245
 
2327
2246
  return Qnil;
2328
2247
  }