llama_cpp 0.8.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8045208b5f7801979212a4f6ed395217e78f06bcfbc2d0362aaaa04c529745cd
4
- data.tar.gz: 4011dfe279d8d4041c6c79dc5a6bad199777f83b5f0559f11ccd2f68c957e462
3
+ metadata.gz: dae7507ce41f18e3fd0fb2d7445275a387a3914068aa9eef922f260de699970a
4
+ data.tar.gz: d66cc2629aeca3285bc10988f8c410fb8cf5b7f1fe6f835b5dc60e9dcab4be9d
5
5
  SHA512:
6
- metadata.gz: d15e74da491773961006eca8ca6c6d80b30ffc995c56a9140961be0002eb09134f1a029c4e8ee192497fb7256fe36cf1c3ed928967ce57ece4c7a0904392c8fe
7
- data.tar.gz: a863596304ddb9ac5e4be2b2b65bebc7d3913705b8a0f516debfee0ca213f9dca69707edda8d70cfafb15500fcb6e70cffb6d5d1119302d24e05059c50f0da77
6
+ metadata.gz: 3e3e92aa38413877620947ec7996494cd720a3c211fcdf1973ce0d7a9a7e8803e293e2ce2f601b11e35858c5b4ef6b00d716069e322ea8d6b4c93412990fd746
7
+ data.tar.gz: 20a1e9e0e5812da9b00787afbf0f3aa0b762c8168f54ce3b7f2f25ff5b61cca5b2e7ab5faa065fbc3e266468d1c5747b8e0779fc7e073cc66240d1f3085e71c7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,22 @@
1
+ ## [[0.9.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.0...v0.9.1)] - 2023-11-03
2
+
3
+ - Bump bundled llama.cpp from b1429 to b1472
4
+ - Rename `kv_cache_tokens_rm` method to `kv_cache_clear` in Context.
5
+ - Add `sample_min_p` method to Context.
6
+ - Add `rope_scaling_type`, `rope_freq_base`, `rope_freq_scale`, `yarn_ext_factor`, `yarn_attn_factor`, `yarn_beta_fast`, `yarn_beta_slow`, and `yarn_orig_ctx` to ContextParams.
7
+ - Add `pure` to ModelQuantizeParams.
8
+ - Add constants for RoPE scaling type.
9
+
10
+ ## [[0.9.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.8.0...v0.9.0)] - 2023-10-28
11
+
12
+ - Fix a missing object file for ggml-backend when building with the metal and cublas options.
13
+
14
+ **Breaking Changes**
15
+ - Bump bundled llama.cpp from b1405 to b1429
16
+ - Move the following methods from Context to Model:
17
+   - text, score, type, token_bos, token_eos, token_nl, token_prefix, token_middle, token_suffix, and token_eot.
18
+ - Add `sample_repetition_penalties` method, which integrates the `sample_frequency_and_presence_penalties` and `sample_repetition_penalty` methods.
19
+
1
20
  ## [[0.8.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.1...v0.8.0)] - 2023-10-21
2
21
 
3
22
  **Breaking Changes**
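
The 0.9.x entries above come down to three call-site changes. A minimal migration sketch, assuming `context` is an initialized `LLaMACpp::Context` and `candidates`/`last_tokens` are built as in `data/examples/chat.rb` below (not a standalone program):

```ruby
# 0.8.0 used two calls: sample_repetition_penalty and
# sample_frequency_and_presence_penalties; 0.9.0 merges them into one.
context.sample_repetition_penalties(
  candidates, last_tokens,
  penalty_repeat: 1.1,    # illustrative values, not recommendations
  penalty_freq: 0.0,
  penalty_present: 0.0
)

# Token lookups moved from Context to Model.
eos = context.model.token_eos   # was context.token_eos
nl  = context.model.token_nl    # was context.token_nl

# 0.9.1 replaces kv_cache_tokens_rm (which took two position arguments)
# with a no-argument clear of the whole KV cache.
context.kv_cache_clear
```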
data/examples/chat.rb CHANGED
@@ -83,10 +83,12 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
83
83
  candidates = LLaMACpp::TokenDataArray.new(base_candidates)
84
84
 
85
85
  last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
86
- context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: options[:repeat_penalty])
87
- context.sample_frequency_and_presence_penalties(
88
- candidates, last_n_tokens[-last_n_repeat..],
89
- frequency: options[:frequency_penalty], presence: options[:presence_penalty]
86
+ context.sample_repetition_penalties(
87
+ candidates,
88
+ last_n_tokens[-last_n_repeat..],
89
+ penalty_repeat: options[:repeat_penalty],
90
+ penalty_freq: options[:frequency_penalty],
91
+ penalty_present: options[:presence_penalty]
90
92
  )
91
93
 
92
94
  context.sample_top_k(candidates, k: options[:top_k])
@@ -99,8 +101,8 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
99
101
  last_n_tokens.shift
100
102
  last_n_tokens.push(id)
101
103
 
102
- if id == context.token_eos
103
- id = context.token_nl
104
+ if id == context.model.token_eos
105
+ id = context.model.token_nl
104
106
  unless antiprompt.empty?
105
107
  first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
106
108
  embd_input.concat(first_antiprompt)
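
The updated example keeps its existing sampling chain; the `sample_min_p` method added in 0.9.1 is not used in it. A hedged sketch of where it could slot in (the 0.05 cutoff is illustrative; `min_keep` is optional and defaults to 1 in the binding):

```ruby
context.sample_top_k(candidates, k: options[:top_k])
# New in 0.9.1: roughly, drops candidates whose probability is below
# `prob` times that of the most likely candidate.
context.sample_min_p(candidates, prob: 0.05)
```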
@@ -5,7 +5,7 @@ require 'fileutils'
5
5
 
6
6
  abort 'libstdc++ is not found.' unless have_library('stdc++')
7
7
 
8
- $srcs = %w[ggml.c ggml-backend.c ggml-alloc.c llama.cpp llama_cpp.cpp]
8
+ $srcs = %w[ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c llama.cpp llama_cpp.cpp]
9
9
  $srcs << 'ggml-opencl.cpp' if with_config('clblast')
10
10
  $srcs << 'ggml-mpi.c' if with_config('mpi')
11
11
  $CFLAGS << ' -w -DNDEBUG'
@@ -18,12 +18,6 @@ if RUBY_PLATFORM.match?(/darwin|linux|bsd/) && try_compile('#include <stdio.h>',
18
18
  $CXXFLAGS << ' -pthread'
19
19
  end
20
20
 
21
- unless with_config('no_k_quants')
22
- $CFLAGS << ' -DGGML_USE_K_QUANTS'
23
- $CXXFLAGS << ' -DGGML_USE_K_QUANTS'
24
- $srcs << 'k_quants.c'
25
- end
26
-
27
21
  if with_config('qkk_64')
28
22
  $CFLAGS << ' -DGGML_QKK_64'
29
23
  $CXXFLAGS << ' -DGGML_QKK_64'
@@ -53,16 +47,14 @@ if with_config('metal')
53
47
  $CFLAGS << ' -DGGML_USE_METAL'
54
48
  $CXXFLAGS << ' -DGGML_USE_METAL'
55
49
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
56
- $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
57
- $objs << 'k_quants.o' unless with_config('no_k_quants')
50
+ $objs = %w[ggml.o ggml-backend.o ggml-alloc.o ggml-quants.o ggml-metal.o llama.o llama_cpp.o]
58
51
  end
59
52
 
60
53
  if with_config('cublas')
61
54
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
62
55
  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
63
56
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
64
- $objs = %w[ggml.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
65
- $objs << 'k_quants.o' unless with_config('no_k_quants')
57
+ $objs = %w[ggml.o ggml-backend.o ggml-alloc.o ggml-quants.o ggml-cuda.o llama.o llama_cpp.o]
66
58
  end
67
59
 
68
60
  if with_config('clblast')
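
With this change the `no_k_quants` build option is gone: the k-quant kernels now live in `ggml-quants.c`, which is always compiled, and `ggml-backend.o`/`ggml-quants.o` are added to the Metal and cuBLAS object lists. The optional backends are still selected through mkmf `with_config` flags at install time, e.g. `gem install llama_cpp -- --with-metal` or `gem install llama_cpp -- --with-cublas` (flag spellings assumed from the `with_config` checks above).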
@@ -796,10 +796,22 @@ public:
796
796
  rb_define_method(rb_cLLaMAContextParams, "n_threads", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads), 0);
797
797
  rb_define_method(rb_cLLaMAContextParams, "n_threads_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads_batch), 1);
798
798
  rb_define_method(rb_cLLaMAContextParams, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads_batch), 0);
799
+ rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_scaling_type), 1);
800
+ rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type", RUBY_METHOD_FUNC(_llama_context_params_get_rope_scaling_type), 0);
799
801
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_base=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_base), 1);
800
802
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_base", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_base), 0);
801
803
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_scale), 1);
802
804
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_scale), 0);
805
+ rb_define_method(rb_cLLaMAContextParams, "yarn_ext_factor=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_ext_factor), 1);
806
+ rb_define_method(rb_cLLaMAContextParams, "yarn_ext_factor", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_ext_factor), 0);
807
+ rb_define_method(rb_cLLaMAContextParams, "yarn_attn_factor=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_attn_factor), 1);
808
+ rb_define_method(rb_cLLaMAContextParams, "yarn_attn_factor", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_attn_factor), 0);
809
+ rb_define_method(rb_cLLaMAContextParams, "yarn_beta_fast=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_beta_fast), 1);
810
+ rb_define_method(rb_cLLaMAContextParams, "yarn_beta_fast", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_fast), 0);
811
+ rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_beta_slow), 1);
812
+ rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_slow), 0);
813
+ rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_orig_ctx), 1);
814
+ rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_orig_ctx), 0);
803
815
  rb_define_method(rb_cLLaMAContextParams, "mul_mat_q=", RUBY_METHOD_FUNC(_llama_context_params_set_mul_mat_q), 1);
804
816
  rb_define_method(rb_cLLaMAContextParams, "mul_mat_q", RUBY_METHOD_FUNC(_llama_context_params_get_mul_mat_q), 0);
805
817
  rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -883,6 +895,18 @@ private:
883
895
  return INT2NUM(ptr->params.n_threads_batch);
884
896
  }
885
897
 
898
+ // rope_scaling_type
899
+ static VALUE _llama_context_params_set_rope_scaling_type(VALUE self, VALUE scaling_type) {
900
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
901
+ ptr->params.rope_scaling_type = NUM2INT(scaling_type);
902
+ return INT2NUM(ptr->params.rope_scaling_type);
903
+ }
904
+
905
+ static VALUE _llama_context_params_get_rope_scaling_type(VALUE self) {
906
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
907
+ return INT2NUM(ptr->params.rope_scaling_type);
908
+ }
909
+
886
910
  // rope_freq_base
887
911
  static VALUE _llama_context_params_set_rope_freq_base(VALUE self, VALUE rope_freq_base) {
888
912
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -907,6 +931,66 @@ private:
907
931
  return DBL2NUM(ptr->params.rope_freq_scale);
908
932
  }
909
933
 
934
+ // yarn_ext_factor
935
+ static VALUE _llama_context_params_set_yarn_ext_factor(VALUE self, VALUE yarn_ext_factor) {
936
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
937
+ ptr->params.yarn_ext_factor = NUM2DBL(yarn_ext_factor);
938
+ return DBL2NUM(ptr->params.yarn_ext_factor);
939
+ }
940
+
941
+ static VALUE _llama_context_params_get_yarn_ext_factor(VALUE self) {
942
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
943
+ return DBL2NUM(ptr->params.yarn_ext_factor);
944
+ }
945
+
946
+ // yarn_attn_factor
947
+ static VALUE _llama_context_params_set_yarn_attn_factor(VALUE self, VALUE yarn_attn_factor) {
948
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
949
+ ptr->params.yarn_attn_factor = NUM2DBL(yarn_attn_factor);
950
+ return DBL2NUM(ptr->params.yarn_attn_factor);
951
+ }
952
+
953
+ static VALUE _llama_context_params_get_yarn_attn_factor(VALUE self) {
954
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
955
+ return DBL2NUM(ptr->params.yarn_attn_factor);
956
+ }
957
+
958
+ // yarn_beta_fast
959
+ static VALUE _llama_context_params_set_yarn_beta_fast(VALUE self, VALUE yarn_beta_fast) {
960
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
961
+ ptr->params.yarn_beta_fast = NUM2DBL(yarn_beta_fast);
962
+ return DBL2NUM(ptr->params.yarn_beta_fast);
963
+ }
964
+
965
+ static VALUE _llama_context_params_get_yarn_beta_fast(VALUE self) {
966
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
967
+ return DBL2NUM(ptr->params.yarn_beta_fast);
968
+ }
969
+
970
+ // yarn_beta_slow
971
+ static VALUE _llama_context_params_set_yarn_beta_slow(VALUE self, VALUE yarn_beta_slow) {
972
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
973
+ ptr->params.yarn_beta_slow = NUM2DBL(yarn_beta_slow);
974
+ return DBL2NUM(ptr->params.yarn_beta_slow);
975
+ }
976
+
977
+ static VALUE _llama_context_params_get_yarn_beta_slow(VALUE self) {
978
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
979
+ return DBL2NUM(ptr->params.yarn_beta_slow);
980
+ }
981
+
982
+ // yarn_orig_ctx
983
+ static VALUE _llama_context_params_set_yarn_orig_ctx(VALUE self, VALUE yarn_orig_ctx) {
984
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
985
+ ptr->params.yarn_orig_ctx = NUM2UINT(yarn_orig_ctx);
986
+ return UINT2NUM(ptr->params.yarn_orig_ctx);
987
+ }
988
+
989
+ static VALUE _llama_context_params_get_yarn_orig_ctx(VALUE self) {
990
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
991
+ return UINT2NUM(ptr->params.yarn_orig_ctx);
992
+ }
993
+
910
994
  // mul_mat_q
911
995
  static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
912
996
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
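
For reference, the new ContextParams accessors map straight onto the corresponding `llama_context_params` fields. A minimal sketch (values are illustrative, not tuning advice):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.rope_freq_base   = 10000.0
params.rope_freq_scale  = 1.0
# YaRN parameters added in 0.9.1
params.yarn_ext_factor  = 1.0
params.yarn_attn_factor = 1.0
params.yarn_beta_fast   = 32.0
params.yarn_beta_slow   = 1.0
params.yarn_orig_ctx    = 4096
params.yarn_beta_fast   # => 32.0
```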
@@ -1011,6 +1095,8 @@ public:
1011
1095
  rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
1012
1096
  rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_only_copy), 1);
1013
1097
  rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
1098
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
1099
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
1014
1100
  }
1015
1101
 
1016
1102
  private:
@@ -1083,6 +1169,18 @@ private:
1083
1169
  LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
1084
1170
  return ptr->params.only_copy ? Qtrue : Qfalse;
1085
1171
  }
1172
+
1173
+ // pure
1174
+ static VALUE _llama_model_quantize_params_set_pure(VALUE self, VALUE pure) {
1175
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
1176
+ ptr->params.pure = RTEST(pure) ? true : false;
1177
+ return ptr->params.pure ? Qtrue : Qfalse;
1178
+ }
1179
+
1180
+ static VALUE _llama_model_quantize_params_get_pure(VALUE self) {
1181
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
1182
+ return ptr->params.pure ? Qtrue : Qfalse;
1183
+ }
1086
1184
  };
1087
1185
 
1088
1186
  const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
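
A short sketch of the new `pure` flag on ModelQuantizeParams; per upstream llama.cpp it disables the per-tensor quantization mix so every tensor is quantized to the requested type (semantics summarized from upstream, not from this diff):

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.pure = true
params.pure        # => true
params.only_copy   # => false by default
```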
@@ -1148,6 +1246,16 @@ public:
1148
1246
  rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
1149
1247
  rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
1150
1248
  rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
1249
+ rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
1250
+ rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
1251
+ rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
1252
+ rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
1253
+ rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
1254
+ rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
1255
+ rb_define_method(rb_cLLaMAModel, "token_prefix", RUBY_METHOD_FUNC(_llama_model_token_prefix), 0);
1256
+ rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
1257
+ rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
1258
+ rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
1151
1259
  }
1152
1260
 
1153
1261
  private:
@@ -1396,6 +1504,62 @@ private:
1396
1504
  LLaMAModelWrapper* ptr = get_llama_model(self);
1397
1505
  return UINT2NUM(llama_model_n_params(ptr->model));
1398
1506
  }
1507
+
1508
+ static VALUE _llama_model_get_text(VALUE self, VALUE token_) {
1509
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1510
+ const llama_token token = NUM2INT(token_);
1511
+ const char* text = llama_token_get_text(ptr->model, token);
1512
+ return rb_utf8_str_new_cstr(text);
1513
+ }
1514
+
1515
+ static VALUE _llama_model_get_score(VALUE self, VALUE token_) {
1516
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1517
+ const llama_token token = NUM2INT(token_);
1518
+ const float score = llama_token_get_score(ptr->model, token);
1519
+ return DBL2NUM(score);
1520
+ }
1521
+
1522
+ static VALUE _llama_model_get_type(VALUE self, VALUE token_) {
1523
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1524
+ const llama_token token = NUM2INT(token_);
1525
+ const int type = llama_token_get_type(ptr->model, token);
1526
+ return INT2NUM(type);
1527
+ }
1528
+
1529
+ static VALUE _llama_model_token_bos(VALUE self) {
1530
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1531
+ return INT2NUM(llama_token_bos(ptr->model));
1532
+ }
1533
+
1534
+ static VALUE _llama_model_token_eos(VALUE self) {
1535
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1536
+ return INT2NUM(llama_token_eos(ptr->model));
1537
+ }
1538
+
1539
+ static VALUE _llama_model_token_nl(VALUE self) {
1540
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1541
+ return INT2NUM(llama_token_nl(ptr->model));
1542
+ }
1543
+
1544
+ static VALUE _llama_model_token_prefix(VALUE self) {
1545
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1546
+ return INT2NUM(llama_token_prefix(ptr->model));
1547
+ }
1548
+
1549
+ static VALUE _llama_model_token_middle(VALUE self) {
1550
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1551
+ return INT2NUM(llama_token_middle(ptr->model));
1552
+ }
1553
+
1554
+ static VALUE _llama_model_token_suffix(VALUE self) {
1555
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1556
+ return INT2NUM(llama_token_suffix(ptr->model));
1557
+ }
1558
+
1559
+ static VALUE _llama_model_token_eot(VALUE self) {
1560
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1561
+ return INT2NUM(llama_token_eot(ptr->model));
1562
+ }
1399
1563
  };
1400
1564
 
1401
1565
  const rb_data_type_t RbLLaMAModel::llama_model_type = {
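
These per-token helpers, previously on Context, are now reachable from a loaded model. A sketch assuming `model` is a `LLaMACpp::Model`:

```ruby
bos = model.token_bos        # beginning-of-sequence token id
model.text(bos)              # text piece for a token id
model.score(bos)             # vocabulary score (Float)
model.type(bos)              # token type as an Integer
# infill and end-of-turn tokens
model.token_prefix
model.token_middle
model.token_suffix
model.token_eot
```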
@@ -1670,22 +1834,12 @@ public:
1670
1834
  rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
1671
1835
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
1672
1836
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
1673
- rb_define_method(rb_cLLaMAContext, "text", RUBY_METHOD_FUNC(_llama_context_text), 1);
1674
- rb_define_method(rb_cLLaMAContext, "score", RUBY_METHOD_FUNC(_llama_context_score), 1);
1675
- rb_define_method(rb_cLLaMAContext, "type", RUBY_METHOD_FUNC(_llama_context_type), 1);
1676
- rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
1677
- rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
1678
- rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
1679
- rb_define_method(rb_cLLaMAContext, "token_prefix", RUBY_METHOD_FUNC(_llama_context_token_prefix), 0);
1680
- rb_define_method(rb_cLLaMAContext, "token_middle", RUBY_METHOD_FUNC(_llama_context_token_middle), 0);
1681
- rb_define_method(rb_cLLaMAContext, "token_suffix", RUBY_METHOD_FUNC(_llama_context_token_suffix), 0);
1682
- rb_define_method(rb_cLLaMAContext, "token_eot", RUBY_METHOD_FUNC(_llama_context_token_eot), 0);
1683
1837
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
1684
1838
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
1685
1839
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
1686
1840
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
1687
1841
  rb_define_method(rb_cLLaMAContext, "kv_cache_token_count", RUBY_METHOD_FUNC(_llama_context_kv_cache_token_count), 0);
1688
- rb_define_method(rb_cLLaMAContext, "kv_cache_tokens_rm", RUBY_METHOD_FUNC(_llama_context_kv_cache_tokens_rm), 2);
1842
+ rb_define_method(rb_cLLaMAContext, "kv_cache_clear", RUBY_METHOD_FUNC(_llama_context_kv_cache_clear), 0);
1689
1843
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_rm", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_rm), 3);
1690
1844
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
1691
1845
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
@@ -1693,12 +1847,12 @@ public:
1693
1847
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
1694
1848
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
1695
1849
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
1696
- rb_define_method(rb_cLLaMAContext, "sample_repetition_penalty", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalty), -1);
1697
- rb_define_method(rb_cLLaMAContext, "sample_frequency_and_presence_penalties", RUBY_METHOD_FUNC(_llama_context_sample_frequency_and_presence_penalties), -1);
1850
+ rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
1698
1851
  rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
1699
1852
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
1700
1853
  rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
1701
1854
  rb_define_method(rb_cLLaMAContext, "sample_top_p", RUBY_METHOD_FUNC(_llama_context_sample_top_p), -1);
1855
+ rb_define_method(rb_cLLaMAContext, "sample_min_p", RUBY_METHOD_FUNC(_llama_context_sample_min_p), -1);
1702
1856
  rb_define_method(rb_cLLaMAContext, "sample_tail_free", RUBY_METHOD_FUNC(_llama_context_sample_tail_free), -1);
1703
1857
  rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
1704
1858
  rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
@@ -1927,102 +2081,6 @@ private:
1927
2081
  return output;
1928
2082
  }
1929
2083
 
1930
- static VALUE _llama_context_text(VALUE self, VALUE token_) {
1931
- LLaMAContextWrapper* ptr = get_llama_context(self);
1932
- if (ptr->ctx == NULL) {
1933
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1934
- return Qnil;
1935
- }
1936
- const llama_token token = NUM2INT(token_);
1937
- const char* text = llama_token_get_text(ptr->ctx, token);
1938
- return rb_utf8_str_new_cstr(text);
1939
- }
1940
-
1941
- static VALUE _llama_context_score(VALUE self, VALUE token_) {
1942
- LLaMAContextWrapper* ptr = get_llama_context(self);
1943
- if (ptr->ctx == NULL) {
1944
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1945
- return Qnil;
1946
- }
1947
- const llama_token token = NUM2INT(token_);
1948
- const float score = llama_token_get_score(ptr->ctx, token);
1949
- return DBL2NUM(score);
1950
- }
1951
-
1952
- static VALUE _llama_context_type(VALUE self, VALUE token_) {
1953
- LLaMAContextWrapper* ptr = get_llama_context(self);
1954
- if (ptr->ctx == NULL) {
1955
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1956
- return Qnil;
1957
- }
1958
- const llama_token token = NUM2INT(token_);
1959
- const int type = llama_token_get_type(ptr->ctx, token);
1960
- return INT2NUM(type);
1961
- }
1962
-
1963
- static VALUE _llama_context_token_bos(VALUE self) {
1964
- LLaMAContextWrapper* ptr = get_llama_context(self);
1965
- if (ptr->ctx == NULL) {
1966
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1967
- return Qnil;
1968
- }
1969
- return INT2NUM(llama_token_bos(ptr->ctx));
1970
- }
1971
-
1972
- static VALUE _llama_context_token_eos(VALUE self) {
1973
- LLaMAContextWrapper* ptr = get_llama_context(self);
1974
- if (ptr->ctx == NULL) {
1975
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1976
- return Qnil;
1977
- }
1978
- return INT2NUM(llama_token_eos(ptr->ctx));
1979
- }
1980
-
1981
- static VALUE _llama_context_token_nl(VALUE self) {
1982
- LLaMAContextWrapper* ptr = get_llama_context(self);
1983
- if (ptr->ctx == NULL) {
1984
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1985
- return Qnil;
1986
- }
1987
- return INT2NUM(llama_token_nl(ptr->ctx));
1988
- }
1989
-
1990
- static VALUE _llama_context_token_prefix(VALUE self) {
1991
- LLaMAContextWrapper* ptr = get_llama_context(self);
1992
- if (ptr->ctx == NULL) {
1993
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1994
- return Qnil;
1995
- }
1996
- return INT2NUM(llama_token_prefix(ptr->ctx));
1997
- }
1998
-
1999
- static VALUE _llama_context_token_middle(VALUE self) {
2000
- LLaMAContextWrapper* ptr = get_llama_context(self);
2001
- if (ptr->ctx == NULL) {
2002
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2003
- return Qnil;
2004
- }
2005
- return INT2NUM(llama_token_middle(ptr->ctx));
2006
- }
2007
-
2008
- static VALUE _llama_context_token_suffix(VALUE self) {
2009
- LLaMAContextWrapper* ptr = get_llama_context(self);
2010
- if (ptr->ctx == NULL) {
2011
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2012
- return Qnil;
2013
- }
2014
- return INT2NUM(llama_token_suffix(ptr->ctx));
2015
- }
2016
-
2017
- static VALUE _llama_context_token_eot(VALUE self) {
2018
- LLaMAContextWrapper* ptr = get_llama_context(self);
2019
- if (ptr->ctx == NULL) {
2020
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2021
- return Qnil;
2022
- }
2023
- return INT2NUM(llama_token_eot(ptr->ctx));
2024
- }
2025
-
2026
2084
  static VALUE _llama_context_n_ctx(VALUE self) {
2027
2085
  LLaMAContextWrapper* ptr = get_llama_context(self);
2028
2086
  if (ptr->ctx == NULL) {
@@ -2073,13 +2131,13 @@ private:
2073
2131
  return INT2NUM(llama_get_kv_cache_token_count(ptr->ctx));
2074
2132
  }
2075
2133
 
2076
- static VALUE _llama_context_kv_cache_tokens_rm(VALUE self, VALUE c0, VALUE c1) {
2134
+ static VALUE _llama_context_kv_cache_clear(VALUE self) {
2077
2135
  LLaMAContextWrapper* ptr = get_llama_context(self);
2078
2136
  if (ptr->ctx == NULL) {
2079
2137
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2080
2138
  return Qnil;
2081
2139
  }
2082
- llama_kv_cache_tokens_rm(ptr->ctx, NUM2INT(c0), NUM2INT(c1));
2140
+ llama_kv_cache_clear(ptr->ctx);
2083
2141
  return Qnil;
2084
2142
  }
2085
2143
 
@@ -2231,14 +2289,14 @@ private:
2231
2289
  return Qnil;
2232
2290
  }
2233
2291
 
2234
- static VALUE _llama_context_sample_repetition_penalty(int argc, VALUE* argv, VALUE self) {
2292
+ static VALUE _llama_context_sample_repetition_penalties(int argc, VALUE* argv, VALUE self) {
2235
2293
  VALUE kw_args = Qnil;
2236
- ID kw_table[1] = { rb_intern("penalty") };
2237
- VALUE kw_values[1] = { Qundef };
2294
+ ID kw_table[3] = { rb_intern("penalty_repeat"), rb_intern("penalty_freq"), rb_intern("penalty_present") };
2295
+ VALUE kw_values[3] = { Qundef, Qundef, Qundef };
2238
2296
  VALUE candidates = Qnil;
2239
2297
  VALUE last_n_tokens = Qnil;
2240
2298
  rb_scan_args(argc, argv, "2:", &candidates, &last_n_tokens, &kw_args);
2241
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
2299
+ rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
2242
2300
 
2243
2301
  if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
2244
2302
  rb_raise(rb_eArgError, "candidates must be a TokenDataArray");
@@ -2249,56 +2307,15 @@ private:
2249
2307
  return Qnil;
2250
2308
  }
2251
2309
  if (!RB_FLOAT_TYPE_P(kw_values[0])) {
2252
- rb_raise(rb_eArgError, "penalty must be a float");
2253
- return Qnil;
2254
- }
2255
-
2256
- const size_t last_tokens_size = RARRAY_LEN(last_n_tokens);
2257
- std::vector<llama_token> last_n_tokens_data(last_tokens_size);
2258
- for (size_t i = 0; i < last_tokens_size; i++) {
2259
- last_n_tokens_data[i] = NUM2INT(rb_ary_entry(last_n_tokens, i));
2260
- }
2261
-
2262
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
2263
- if (ctx_ptr->ctx == NULL) {
2264
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2310
+ rb_raise(rb_eArgError, "penalty_repeat must be a float");
2265
2311
  return Qnil;
2266
2312
  }
2267
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
2268
- if (cnd_ptr->array.data == nullptr) {
2269
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
2270
- return Qnil;
2271
- }
2272
- const float penalty = NUM2DBL(kw_values[0]);
2273
-
2274
- llama_sample_repetition_penalty(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size, penalty);
2275
-
2276
- return Qnil;
2277
- }
2278
-
2279
- static VALUE _llama_context_sample_frequency_and_presence_penalties(int argc, VALUE* argv, VALUE self) {
2280
- VALUE kw_args = Qnil;
2281
- ID kw_table[2] = { rb_intern("frequency"), rb_intern("presence") };
2282
- VALUE kw_values[2] = { Qundef, Qundef };
2283
- VALUE candidates = Qnil;
2284
- VALUE last_n_tokens = Qnil;
2285
- rb_scan_args(argc, argv, "2:", &candidates, &last_n_tokens, &kw_args);
2286
- rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
2287
-
2288
- if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
2289
- rb_raise(rb_eArgError, "candidates must be a TokenDataArray");
2290
- return Qnil;
2291
- }
2292
- if (!RB_TYPE_P(last_n_tokens, T_ARRAY)) {
2293
- rb_raise(rb_eArgError, "last_n_tokens must be an Array");
2294
- return Qnil;
2295
- }
2296
- if (!RB_FLOAT_TYPE_P(kw_values[0])) {
2297
- rb_raise(rb_eArgError, "frequency must be a float");
2313
+ if (!RB_FLOAT_TYPE_P(kw_values[1])) {
2314
+ rb_raise(rb_eArgError, "penalty_freq must be a float");
2298
2315
  return Qnil;
2299
2316
  }
2300
- if (!RB_FLOAT_TYPE_P(kw_values[1])) {
2301
- rb_raise(rb_eArgError, "presence must be a float");
2317
+ if (!RB_FLOAT_TYPE_P(kw_values[2])) {
2318
+ rb_raise(rb_eArgError, "penalty_present must be a float");
2302
2319
  return Qnil;
2303
2320
  }
2304
2321
 
@@ -2318,11 +2335,12 @@ private:
2318
2335
  rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
2319
2336
  return Qnil;
2320
2337
  }
2338
+ const float penalty_repeat = NUM2DBL(kw_values[0]);
2339
+ const float penalty_freq = NUM2DBL(kw_values[1]);
2340
+ const float penalty_present = NUM2DBL(kw_values[2]);
2321
2341
 
2322
- const float alpha_frequency = NUM2DBL(kw_values[0]);
2323
- const float alpha_presence = NUM2DBL(kw_values[1]);
2324
-
2325
- llama_sample_frequency_and_presence_penalties(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size, alpha_frequency, alpha_presence);
2342
+ llama_sample_repetition_penalties(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size,
2343
+ penalty_repeat, penalty_freq, penalty_present);
2326
2344
 
2327
2345
  return Qnil;
2328
2346
  }
@@ -2467,6 +2485,45 @@ private:
2467
2485
  return Qnil;
2468
2486
  }
2469
2487
 
2488
+ static VALUE _llama_context_sample_min_p(int argc, VALUE* argv, VALUE self) {
2489
+ VALUE kw_args = Qnil;
2490
+ ID kw_table[2] = { rb_intern("prob"), rb_intern("min_keep") };
2491
+ VALUE kw_values[2] = { Qundef, Qundef };
2492
+ VALUE candidates = Qnil;
2493
+ rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
2494
+ rb_get_kwargs(kw_args, kw_table, 1, 1, kw_values);
2495
+
2496
+ if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
2497
+ rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
2498
+ return Qnil;
2499
+ }
2500
+ if (!RB_FLOAT_TYPE_P(kw_values[0])) {
2501
+ rb_raise(rb_eArgError, "prob must be a float");
2502
+ return Qnil;
2503
+ }
2504
+ if (kw_values[1] != Qundef && !RB_INTEGER_TYPE_P(kw_values[1])) {
2505
+ rb_raise(rb_eArgError, "min_keep must be an integer");
2506
+ return Qnil;
2507
+ }
2508
+
2509
+ LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
2510
+ if (ctx_ptr->ctx == NULL) {
2511
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2512
+ return Qnil;
2513
+ }
2514
+ LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
2515
+ if (cnd_ptr->array.data == nullptr) {
2516
+ rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
2517
+ return Qnil;
2518
+ }
2519
+ const float prob = NUM2DBL(kw_values[0]);
2520
+ const size_t min_keep = kw_values[1] != Qundef ? NUM2SIZET(kw_values[1]) : 1;
2521
+
2522
+ llama_sample_min_p(ctx_ptr->ctx, &(cnd_ptr->array), prob, min_keep);
2523
+
2524
+ return Qnil;
2525
+ }
2526
+
2470
2527
  static VALUE _llama_context_sample_tail_free(int argc, VALUE* argv, VALUE self) {
2471
2528
  VALUE kw_args = Qnil;
2472
2529
  ID kw_table[2] = { rb_intern("z"), rb_intern("min_keep") };
@@ -2962,6 +3019,12 @@ extern "C" void Init_llama_cpp(void) {
2962
3019
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_RNG_UPPER", INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER));
2963
3020
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));
2964
3021
 
3022
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_UNSPECIFIED));
3023
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_NONE", INT2NUM(LLAMA_ROPE_SCALING_NONE));
3024
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_LINEAR));
3025
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
3026
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
3027
+
2965
3028
  std::stringstream ss_magic;
2966
3029
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
2967
3030
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));