llama_cpp 0.7.1 → 0.9.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 6688a7296f7a7e7ba4aa593b2d9b792beb1d569f7f2e0e872e1dbda64a336b57
- data.tar.gz: 3f683714c3b11b8f247d9ef40774b90e297c25f3bf2ab478e763bda9c983d73a
+ metadata.gz: 683f2d81aff9e82234925ba08cd5b46b56a2283ff8397a6c06ce50d34a95dbfc
+ data.tar.gz: d3005cab273b8d85f47f4cb4314fbab3a540d366a42829e5ec8d2c29576ae09e
  SHA512:
- metadata.gz: d7dc061516e688624f4090b956fd40999c9e2e5d2ae41fe8a1baac3caaf61ed9aef3ef31e8ca971e0a210a592cb3618f67533483e5808e2e9205e2ba9a7dfcf8
- data.tar.gz: aae1a4952d19aa186aa2ea97ce59af1dac7295f5430108aaf6545949218851b31c266472cf6111a62f7a5784c5f23fd3e3697f1181d5e659c217975890eed299
+ metadata.gz: 559f1ba1253a704c38480336decd315c65b4d80e6895ad1dc0faa3b5b81570a1faeaadcb6ec7ee3145f0fff758ab5e38e6cb8163382ce9b693d893deebe9a8f9
+ data.tar.gz: cb3d96b8c3f79cd20d4169a175270e8768c04bcaa24e51cb2c4d7872db88bc6e3349e6b1e93a130b89d21daab8be6e57b5305412059ea722084c7cb7d4a01e93
data/CHANGELOG.md CHANGED
@@ -1,3 +1,21 @@
+ ## [[0.9.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.8.0...v0.9.0)] - 2023-10-28
+
+ - Fix missing object file for ggml-backend when building with the metal and cublas options.
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from b1405 to b1429.
+ - Move the following methods from Context to Model:
+   - `text`, `score`, `type`, `token_bos`, `token_eos`, `token_nl`, `token_prefix`, `token_middle`, `token_suffix`, and `token_eot`.
+ - Add `sample_repetition_penalties` method, which merges the `sample_repetition_penalty` and `sample_frequency_and_presence_penalties` methods.
+
+ ## [[0.8.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.1...v0.8.0)] - 2023-10-21
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from b1380 to b1405.
+ - Add column index argument to `set_seq_id` and `get_seq_id` methods in Batch.
+ - Add `special` keyword argument to `tokenize` method in Model.
+ - Add `n_seq_max` keyword argument to `initialize` method in Batch.
+
  ## [[0.7.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.0...v0.7.1)] - 2023-10-14

  - Bump bundled llama.cpp from b1334 to b1380.
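Taken together, the 0.9.0 entries amount to a short migration for typical callers. A minimal sketch, assuming a `context` (with its `model`), a `candidates` TokenDataArray, and a `recent_tokens` Array set up as in examples/chat.rb below; the penalty values are placeholders:

```ruby
# 0.8.0 and earlier:
#   eos = context.token_eos
#   context.sample_repetition_penalty(candidates, recent_tokens, penalty: 1.1)
#   context.sample_frequency_and_presence_penalties(
#     candidates, recent_tokens, frequency: 0.0, presence: 0.0
#   )

# 0.9.0: token queries live on Model, and one call applies all three penalties.
eos = context.model.token_eos
context.sample_repetition_penalties(
  candidates, recent_tokens,
  penalty_repeat: 1.1, penalty_freq: 0.0, penalty_present: 0.0
)
```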
data/examples/chat.rb CHANGED
@@ -83,10 +83,12 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  candidates = LLaMACpp::TokenDataArray.new(base_candidates)

  last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
- context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: options[:repeat_penalty])
- context.sample_frequency_and_presence_penalties(
-   candidates, last_n_tokens[-last_n_repeat..],
-   frequency: options[:frequency_penalty], presence: options[:presence_penalty]
+ context.sample_repetition_penalties(
+   candidates,
+   last_n_tokens[-last_n_repeat..],
+   penalty_repeat: options[:repeat_penalty],
+   penalty_freq: options[:frequency_penalty],
+   penalty_present: options[:presence_penalty]
  )

  context.sample_top_k(candidates, k: options[:top_k])
@@ -99,8 +101,8 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  last_n_tokens.shift
  last_n_tokens.push(id)

- if id == context.token_eos
-   id = context.token_nl
+ if id == context.model.token_eos
+   id = context.model.token_nl
    unless antiprompt.empty?
      first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
      embd_input.concat(first_antiprompt)
data/ext/llama_cpp/extconf.rb CHANGED
@@ -53,7 +53,7 @@ if with_config('metal')
  $CFLAGS << ' -DGGML_USE_METAL'
  $CXXFLAGS << ' -DGGML_USE_METAL'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
- $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
+ $objs = %w[ggml.o ggml-backend.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end

@@ -61,7 +61,7 @@ if with_config('cublas')
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
- $objs = %w[ggml.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
+ $objs = %w[ggml.o ggml-backend.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end
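Both build paths now add `ggml-backend.o` to `$objs`, which is the fix the 0.9.0 changelog entry above describes: the bundled llama.cpp carries a separate ggml-backend translation unit, and without its object file the Metal and cuBLAS variants (selected through mkmf's `with_config`, typically `gem install llama_cpp -- --with-metal` or `-- --with-cublas`) would presumably fail at link time.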
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -63,8 +63,8 @@ public:
  rb_define_method(rb_cLLaMABatch, "get_token", RUBY_METHOD_FUNC(_llama_batch_get_token), 1);
  rb_define_method(rb_cLLaMABatch, "set_pos", RUBY_METHOD_FUNC(_llama_batch_set_pos), 2);
  rb_define_method(rb_cLLaMABatch, "get_pos", RUBY_METHOD_FUNC(_llama_batch_get_pos), 1);
- rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id), 2);
- rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id), 1);
+ rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id), 3);
+ rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id), 2);
  rb_define_method(rb_cLLaMABatch, "set_logits", RUBY_METHOD_FUNC(_llama_batch_set_logits), 2);
  rb_define_method(rb_cLLaMABatch, "get_logits", RUBY_METHOD_FUNC(_llama_batch_get_logits), 1);
  }
@@ -74,10 +74,10 @@ private:

  static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
    VALUE kw_args = Qnil;
-   ID kw_table[2] = { rb_intern("n_tokens"), rb_intern("embd") };
-   VALUE kw_values[2] = { Qundef, Qundef };
+   ID kw_table[3] = { rb_intern("n_tokens"), rb_intern("embd"), rb_intern("n_seq_max") };
+   VALUE kw_values[3] = { Qundef, Qundef, Qundef };
    rb_scan_args(argc, argv, ":", &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+   rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);

    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
      rb_raise(rb_eArgError, "n_tokens must be an integer");
@@ -87,12 +87,17 @@ private:
      rb_raise(rb_eArgError, "embd must be an integer");
      return Qnil;
    }
+   if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+     rb_raise(rb_eArgError, "n_seq_max must be an integer");
+     return Qnil;
+   }

    const int32_t n_tokens = NUM2INT(kw_values[0]);
    const int32_t embd = NUM2INT(kw_values[1]);
+   const int32_t n_seq_max = NUM2INT(kw_values[2]);

    LLaMABatchWrapper* ptr = get_llama_batch(self);
-   ptr->batch = llama_batch_init(n_tokens, embd);
+   ptr->batch = llama_batch_init(n_tokens, embd, n_seq_max);

    return Qnil;
  }
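On the Ruby side, all three keywords are now required when constructing a Batch (the `rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values)` call above declares no optional keys). A minimal sketch; the sizes are placeholders:

```ruby
require 'llama_cpp'

# As in llama.cpp's llama_batch_init, embd: 0 allocates token storage
# rather than embedding storage.
batch = LLaMACpp::Batch.new(n_tokens: 512, embd: 0, n_seq_max: 1)
```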
@@ -190,25 +195,35 @@ private:
  }

  // seq_id
- static VALUE _llama_batch_set_seq_id(VALUE self, VALUE idx, VALUE value) {
+ static VALUE _llama_batch_set_seq_id(VALUE self, VALUE i_, VALUE j_, VALUE value) {
    LLaMABatchWrapper* ptr = get_llama_batch(self);
-   const int32_t id = NUM2INT(idx);
-   if (id < 0 || id >= ptr->batch.n_tokens) {
-     rb_raise(rb_eArgError, "id must be in [0, n_tokens)");
+   const int32_t i = NUM2INT(i_);
+   if (i < 0 || i >= ptr->batch.n_tokens) {
+     rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
+     return Qnil;
+   }
+   const int32_t j = NUM2INT(j_);
+   if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
+     rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
      return Qnil;
    }
-   ptr->batch.seq_id[id] = NUM2INT(value);
-   return INT2NUM(ptr->batch.seq_id[id]);
+   ptr->batch.seq_id[i][j] = NUM2INT(value);
+   return INT2NUM(ptr->batch.seq_id[i][j]);
  }

- static VALUE _llama_batch_get_seq_id(VALUE self, VALUE idx) {
+ static VALUE _llama_batch_get_seq_id(VALUE self, VALUE i_, VALUE j_) {
    LLaMABatchWrapper* ptr = get_llama_batch(self);
-   const int32_t id = NUM2INT(idx);
-   if (id < 0 || id >= ptr->batch.n_tokens) {
-     rb_raise(rb_eArgError, "id must be in [0, n_tokens)");
+   const int32_t i = NUM2INT(i_);
+   if (i < 0 || i >= ptr->batch.n_tokens) {
+     rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
      return Qnil;
    }
-   return INT2NUM(ptr->batch.seq_id[id]);
+   const int32_t j = NUM2INT(j_);
+   if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
+     rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
+     return Qnil;
+   }
+   return INT2NUM(ptr->batch.seq_id[i][j]);
  }

  // logits
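With this change, each token row carries a column of sequence ids, so both accessors take a row index and a column index from Ruby. A small sketch continuing from the `batch` above (indices and the id value are placeholders):

```ruby
# 0.7.x took a single index: batch.set_seq_id(0, 7) / batch.get_seq_id(0)
batch.set_seq_id(0, 0, 7)   # token row 0, sequence slot 0 := sequence id 7
batch.get_seq_id(0, 0)      # => 7
```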
@@ -1133,6 +1148,16 @@ public:
  rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
  rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
  rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
+ rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
+ rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
+ rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
+ rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
+ rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
+ rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
+ rb_define_method(rb_cLLaMAModel, "token_prefix", RUBY_METHOD_FUNC(_llama_model_token_prefix), 0);
+ rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
+ rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
+ rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
  }

  private:
@@ -1319,10 +1344,10 @@ private:

  static VALUE _llama_model_tokenize(int argc, VALUE* argv, VALUE self) {
    VALUE kw_args = Qnil;
-   ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
-   VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+   ID kw_table[4] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos"), rb_intern("special") };
+   VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
    rb_scan_args(argc, argv, ":", &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+   rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);

    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
      rb_raise(rb_eArgError, "text must be a String");
@@ -1336,15 +1361,20 @@ private:
      rb_raise(rb_eArgError, "add_bos must be a boolean");
      return Qnil;
    }
+   if (kw_values[3] != Qundef && (kw_values[3] != Qtrue && kw_values[3] != Qfalse)) {
+     rb_raise(rb_eArgError, "special must be a boolean");
+     return Qnil;
+   }

    VALUE text_ = kw_values[0];
    std::string text = StringValueCStr(text_);
    const bool add_bos = kw_values[2] == Qtrue ? true : false;
+   const bool special = kw_values[3] == Qtrue ? true : false;
    const int n_max_tokens = kw_values[1] != Qundef ? NUM2INT(kw_values[1]) : text.size() + (add_bos ? 1 : 0);

    llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
    LLaMAModelWrapper* ptr = get_llama_model(self);
-   const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);
+   const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos, special);

    if (n_tokens < 0) {
      rb_raise(rb_eRuntimeError, "failed to tokenize. The number of tokens (%d) is greater than n_max_tokens.", -n_tokens);
@@ -1376,6 +1406,62 @@ private:
    LLaMAModelWrapper* ptr = get_llama_model(self);
    return UINT2NUM(llama_model_n_params(ptr->model));
  }
+
+ static VALUE _llama_model_get_text(VALUE self, VALUE token_) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   const llama_token token = NUM2INT(token_);
+   const char* text = llama_token_get_text(ptr->model, token);
+   return rb_utf8_str_new_cstr(text);
+ }
+
+ static VALUE _llama_model_get_score(VALUE self, VALUE token_) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   const llama_token token = NUM2INT(token_);
+   const float score = llama_token_get_score(ptr->model, token);
+   return DBL2NUM(score);
+ }
+
+ static VALUE _llama_model_get_type(VALUE self, VALUE token_) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   const llama_token token = NUM2INT(token_);
+   const int type = llama_token_get_type(ptr->model, token);
+   return INT2NUM(type);
+ }
+
+ static VALUE _llama_model_token_bos(VALUE self) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   return INT2NUM(llama_token_bos(ptr->model));
+ }
+
+ static VALUE _llama_model_token_eos(VALUE self) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   return INT2NUM(llama_token_eos(ptr->model));
+ }
+
+ static VALUE _llama_model_token_nl(VALUE self) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   return INT2NUM(llama_token_nl(ptr->model));
+ }
+
+ static VALUE _llama_model_token_prefix(VALUE self) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   return INT2NUM(llama_token_prefix(ptr->model));
+ }
+
+ static VALUE _llama_model_token_middle(VALUE self) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   return INT2NUM(llama_token_middle(ptr->model));
+ }
+
+ static VALUE _llama_model_token_suffix(VALUE self) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   return INT2NUM(llama_token_suffix(ptr->model));
+ }
+
+ static VALUE _llama_model_token_eot(VALUE self) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   return INT2NUM(llama_token_eot(ptr->model));
+ }
  };

  const rb_data_type_t RbLLaMAModel::llama_model_type = {
  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -1650,16 +1736,6 @@ public:
1650
1736
  rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
1651
1737
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
1652
1738
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
1653
- rb_define_method(rb_cLLaMAContext, "text", RUBY_METHOD_FUNC(_llama_context_text), 1);
1654
- rb_define_method(rb_cLLaMAContext, "score", RUBY_METHOD_FUNC(_llama_context_score), 1);
1655
- rb_define_method(rb_cLLaMAContext, "type", RUBY_METHOD_FUNC(_llama_context_type), 1);
1656
- rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
1657
- rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
1658
- rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
1659
- rb_define_method(rb_cLLaMAContext, "token_prefix", RUBY_METHOD_FUNC(_llama_context_token_prefix), 0);
1660
- rb_define_method(rb_cLLaMAContext, "token_middle", RUBY_METHOD_FUNC(_llama_context_token_middle), 0);
1661
- rb_define_method(rb_cLLaMAContext, "token_suffix", RUBY_METHOD_FUNC(_llama_context_token_suffix), 0);
1662
- rb_define_method(rb_cLLaMAContext, "token_eot", RUBY_METHOD_FUNC(_llama_context_token_eot), 0);
1663
1739
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
1664
1740
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
1665
1741
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
@@ -1673,8 +1749,7 @@ public:
1673
1749
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
1674
1750
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
1675
1751
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
1676
- rb_define_method(rb_cLLaMAContext, "sample_repetition_penalty", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalty), -1);
1677
- rb_define_method(rb_cLLaMAContext, "sample_frequency_and_presence_penalties", RUBY_METHOD_FUNC(_llama_context_sample_frequency_and_presence_penalties), -1);
1752
+ rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
1678
1753
  rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
1679
1754
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
1680
1755
  rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
@@ -1907,102 +1982,6 @@ private:
    return output;
  }

- static VALUE _llama_context_text(VALUE self, VALUE token_) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   const llama_token token = NUM2INT(token_);
-   const char* text = llama_token_get_text(ptr->ctx, token);
-   return rb_utf8_str_new_cstr(text);
- }
-
- static VALUE _llama_context_score(VALUE self, VALUE token_) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   const llama_token token = NUM2INT(token_);
-   const float score = llama_token_get_score(ptr->ctx, token);
-   return DBL2NUM(score);
- }
-
- static VALUE _llama_context_type(VALUE self, VALUE token_) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   const llama_token token = NUM2INT(token_);
-   const int type = llama_token_get_type(ptr->ctx, token);
-   return INT2NUM(type);
- }
-
- static VALUE _llama_context_token_bos(VALUE self) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   return INT2NUM(llama_token_bos(ptr->ctx));
- }
-
- static VALUE _llama_context_token_eos(VALUE self) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   return INT2NUM(llama_token_eos(ptr->ctx));
- }
-
- static VALUE _llama_context_token_nl(VALUE self) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   return INT2NUM(llama_token_nl(ptr->ctx));
- }
-
- static VALUE _llama_context_token_prefix(VALUE self) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   return INT2NUM(llama_token_prefix(ptr->ctx));
- }
-
- static VALUE _llama_context_token_middle(VALUE self) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   return INT2NUM(llama_token_middle(ptr->ctx));
- }
-
- static VALUE _llama_context_token_suffix(VALUE self) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   return INT2NUM(llama_token_suffix(ptr->ctx));
- }
-
- static VALUE _llama_context_token_eot(VALUE self) {
-   LLaMAContextWrapper* ptr = get_llama_context(self);
-   if (ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   return INT2NUM(llama_token_eot(ptr->ctx));
- }
-
  static VALUE _llama_context_n_ctx(VALUE self) {
    LLaMAContextWrapper* ptr = get_llama_context(self);
    if (ptr->ctx == NULL) {
@@ -2211,14 +2190,14 @@ private:
    return Qnil;
  }

- static VALUE _llama_context_sample_repetition_penalty(int argc, VALUE* argv, VALUE self) {
+ static VALUE _llama_context_sample_repetition_penalties(int argc, VALUE* argv, VALUE self) {
    VALUE kw_args = Qnil;
-   ID kw_table[1] = { rb_intern("penalty") };
-   VALUE kw_values[1] = { Qundef };
+   ID kw_table[3] = { rb_intern("penalty_repeat"), rb_intern("penalty_freq"), rb_intern("penalty_present") };
+   VALUE kw_values[3] = { Qundef, Qundef, Qundef };
    VALUE candidates = Qnil;
    VALUE last_n_tokens = Qnil;
    rb_scan_args(argc, argv, "2:", &candidates, &last_n_tokens, &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
+   rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);

    if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
      rb_raise(rb_eArgError, "candidates must be a TokenDataArray");
@@ -2229,56 +2208,15 @@ private:
      return Qnil;
    }
    if (!RB_FLOAT_TYPE_P(kw_values[0])) {
-     rb_raise(rb_eArgError, "penalty must be a float");
+     rb_raise(rb_eArgError, "penalty_repeat must be a float");
      return Qnil;
    }
-
-   const size_t last_tokens_size = RARRAY_LEN(last_n_tokens);
-   std::vector<llama_token> last_n_tokens_data(last_tokens_size);
-   for (size_t i = 0; i < last_tokens_size; i++) {
-     last_n_tokens_data[i] = NUM2INT(rb_ary_entry(last_n_tokens, i));
-   }
-
-   LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
-   if (ctx_ptr->ctx == NULL) {
-     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
-     return Qnil;
-   }
-   LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
-   if (cnd_ptr->array.data == nullptr) {
-     rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
-     return Qnil;
-   }
-   const float penalty = NUM2DBL(kw_values[0]);
-
-   llama_sample_repetition_penalty(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size, penalty);
-
-   return Qnil;
- }
-
- static VALUE _llama_context_sample_frequency_and_presence_penalties(int argc, VALUE* argv, VALUE self) {
-   VALUE kw_args = Qnil;
-   ID kw_table[2] = { rb_intern("frequency"), rb_intern("presence") };
-   VALUE kw_values[2] = { Qundef, Qundef };
-   VALUE candidates = Qnil;
-   VALUE last_n_tokens = Qnil;
-   rb_scan_args(argc, argv, "2:", &candidates, &last_n_tokens, &kw_args);
-   rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
-
-   if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
-     rb_raise(rb_eArgError, "candidates must be a TokenDataArray");
-     return Qnil;
-   }
-   if (!RB_TYPE_P(last_n_tokens, T_ARRAY)) {
-     rb_raise(rb_eArgError, "last_n_tokens must be an Array");
-     return Qnil;
-   }
-   if (!RB_FLOAT_TYPE_P(kw_values[0])) {
-     rb_raise(rb_eArgError, "frequency must be a float");
+   if (!RB_FLOAT_TYPE_P(kw_values[1])) {
+     rb_raise(rb_eArgError, "penalty_freq must be a float");
      return Qnil;
    }
-   if (!RB_FLOAT_TYPE_P(kw_values[1])) {
-     rb_raise(rb_eArgError, "presence must be a float");
+   if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+     rb_raise(rb_eArgError, "penalty_present must be a float");
      return Qnil;
    }

@@ -2298,11 +2236,12 @@ private:
      rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
      return Qnil;
    }
+   const float penalty_repeat = NUM2DBL(kw_values[0]);
+   const float penalty_freq = NUM2DBL(kw_values[1]);
+   const float penalty_present = NUM2DBL(kw_values[2]);

-   const float alpha_frequency = NUM2DBL(kw_values[0]);
-   const float alpha_presence = NUM2DBL(kw_values[1]);
-
-   llama_sample_frequency_and_presence_penalties(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size, alpha_frequency, alpha_presence);
+   llama_sample_repetition_penalties(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size,
+                                     penalty_repeat, penalty_freq, penalty_present);

    return Qnil;
  }
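All three penalty keywords are required (the `rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values)` call above leaves no optional slots), so callers migrating from the old single-purpose methods must pass frequency and presence values even when they are zero, exactly as examples/chat.rb now does:

```ruby
context.sample_repetition_penalties(
  candidates, last_n_tokens[-last_n_repeat..],
  penalty_repeat: options[:repeat_penalty],
  penalty_freq: options[:frequency_penalty],
  penalty_present: options[:presence_penalty]
)
```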