llama_cpp 0.12.7 → 0.14.0

@@ -952,6 +952,8 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads_batch), 0);
  rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_scaling_type), 1);
  rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type", RUBY_METHOD_FUNC(_llama_context_params_get_rope_scaling_type), 0);
+ rb_define_method(rb_cLLaMAContextParams, "pooling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_pooling_type), 1);
+ rb_define_method(rb_cLLaMAContextParams, "pooling_type", RUBY_METHOD_FUNC(_llama_context_params_get_pooling_type), 0);
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_base=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_base), 1);
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_base", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_base), 0);
  rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_scale), 1);
@@ -966,20 +968,18 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_slow), 0);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_orig_ctx), 1);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_orig_ctx), 0);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold=", RUBY_METHOD_FUNC(_llama_context_params_set_defrag_thold), 1);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold", RUBY_METHOD_FUNC(_llama_context_params_get_defrag_thold), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_k=", RUBY_METHOD_FUNC(_llama_context_params_set_type_k), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_k", RUBY_METHOD_FUNC(_llama_context_params_get_type_k), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_v=", RUBY_METHOD_FUNC(_llama_context_params_set_type_v), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q=", RUBY_METHOD_FUNC(_llama_context_params_set_mul_mat_q), 1);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q", RUBY_METHOD_FUNC(_llama_context_params_get_mul_mat_q), 0);
  rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
  rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
- rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
- rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
+ rb_define_method(rb_cLLaMAContextParams, "embeddings=", RUBY_METHOD_FUNC(_llama_context_params_set_embeddings), 1);
+ rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
- rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
- rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
  }

  private:
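Note: ContextParams gains pooling_type and defrag_thold accessors, drops mul_mat_q and do_pooling, and renames embedding to embeddings. A minimal Ruby sketch of the new attribute names, assuming the Ruby-side class is LLaMACpp::ContextParams as in the gem's README (the values are placeholders):

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.embeddings = true                                  # was params.embedding = true in 0.12.x
    params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_MEAN   # replaces the old do_pooling flag
    params.defrag_thold = 0.1                                 # new KV-cache defragmentation threshold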
@@ -1058,7 +1058,7 @@ private:
  // rope_scaling_type
  static VALUE _llama_context_params_set_rope_scaling_type(VALUE self, VALUE scaling_type) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.rope_scaling_type = NUM2INT(scaling_type);
+ ptr->params.rope_scaling_type = static_cast<enum llama_rope_scaling_type>(NUM2INT(scaling_type));
  return INT2NUM(ptr->params.rope_scaling_type);
  }

@@ -1067,6 +1067,18 @@ private:
  return INT2NUM(ptr->params.rope_scaling_type);
  }

+ // pooling_type
+ static VALUE _llama_context_params_set_pooling_type(VALUE self, VALUE scaling_type) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.pooling_type = static_cast<enum llama_pooling_type>(NUM2INT(scaling_type));
+ return INT2NUM(ptr->params.pooling_type);
+ }
+
+ static VALUE _llama_context_params_get_pooling_type(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return INT2NUM(ptr->params.pooling_type);
+ }
+
  // rope_freq_base
  static VALUE _llama_context_params_set_rope_freq_base(VALUE self, VALUE rope_freq_base) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1146,6 +1158,18 @@ private:
  return UINT2NUM(ptr->params.yarn_orig_ctx);
  }

+ // defrag_thold
+ static VALUE _llama_context_params_set_defrag_thold(VALUE self, VALUE defrag_thold) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.defrag_thold = NUM2DBL(defrag_thold);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
+ static VALUE _llama_context_params_get_defrag_thold(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
  static VALUE _llama_context_params_get_yarn_orig_ctx(VALUE self) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  return UINT2NUM(ptr->params.yarn_orig_ctx);
@@ -1175,18 +1199,6 @@ private:
  return INT2NUM(ptr->params.type_v);
  }

- // mul_mat_q
- static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.mul_mat_q = RTEST(mul_mat_q) ? true : false;
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
- static VALUE _llama_context_params_get_mul_mat_q(VALUE self) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
  // logits_all
  static VALUE _llama_context_params_set_logits_all(VALUE self, VALUE logits_all) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1199,16 +1211,16 @@ private:
  return ptr->params.logits_all ? Qtrue : Qfalse;
  }

- // embedding
- static VALUE _llama_context_params_set_embedding(VALUE self, VALUE embedding) {
+ // embeddings
+ static VALUE _llama_context_params_set_embeddings(VALUE self, VALUE embeddings) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.embedding = RTEST(embedding) ? true : false;
- return ptr->params.embedding ? Qtrue : Qfalse;
+ ptr->params.embeddings = RTEST(embeddings) ? true : false;
+ return ptr->params.embeddings ? Qtrue : Qfalse;
  }

- static VALUE _llama_context_params_get_embedding(VALUE self) {
+ static VALUE _llama_context_params_get_embeddings(VALUE self) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- return ptr->params.embedding ? Qtrue : Qfalse;
+ return ptr->params.embeddings ? Qtrue : Qfalse;
  }

  // offload_kqv
@@ -1222,18 +1234,6 @@ private:
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  return ptr->params.offload_kqv ? Qtrue : Qfalse;
  }
-
- // do_pooling
- static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
- return ptr->params.do_pooling ? Qtrue : Qfalse;
- }
-
- static VALUE _llama_context_params_get_do_pooling(VALUE self) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- return ptr->params.do_pooling ? Qtrue : Qfalse;
- }
  };

  const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -1433,7 +1433,8 @@ public:
  rb_define_method(rb_cLLaMAModel, "empty?", RUBY_METHOD_FUNC(_llama_model_empty), 0);
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
  rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
- rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
+ rb_define_method(rb_cLLaMAModel, "vocab_type", RUBY_METHOD_FUNC(_llama_model_get_model_vocab_type), 0);
+ rb_define_method(rb_cLLaMAModel, "rope_type", RUBY_METHOD_FUNC(_llama_model_get_model_rope_type), 0);
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
@@ -1559,41 +1560,14 @@ private:
  return Qnil;
  }

- static VALUE _llama_model_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[4] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads"), rb_intern("scale") };
- VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);
-
- if (!RB_TYPE_P(kw_values[0], T_STRING)) {
- rb_raise(rb_eArgError, "lora_path must be a string");
- return Qnil;
- }
- if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
- rb_raise(rb_eArgError, "base_model_path must be a string");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_threads must be an integer");
- return Qnil;
- }
- if (kw_values[3] != Qundef && !RB_FLOAT_TYPE_P(kw_values[3])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- const char* lora_path = StringValueCStr(kw_values[0]);
- const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
- const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
- const float scale = kw_values[3] == Qundef ? 1.0 : NUM2DBL(kw_values[3]);
+ static VALUE _llama_model_get_model_vocab_type(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_vocab_type(ptr->model));
+ }

+ static VALUE _llama_model_get_model_rope_type(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- if (llama_model_apply_lora_from_file(ptr->model, lora_path, scale, base_model_path, n_threads) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
- return Qnil;
- }
- return Qnil;
+ return INT2NUM(llama_rope_type(ptr->model));
  }

  static VALUE _llama_model_get_model_n_vocab(VALUE self) {
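Note: apply_lora_from_file is removed from Model, while vocab_type and rope_type readers are added. A sketch of the new readers, assuming the Ruby-side classes LLaMACpp::Model and LLaMACpp::ModelParams as in the gem's README and a placeholder model path:

    model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)
    model.vocab_type  # => Integer enum value from llama_vocab_type()
    model.rope_type   # => Integer enum value from llama_rope_type()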
@@ -2038,12 +2012,11 @@ public:
  rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
  rb_define_attr(rb_cLLaMAContext, "model", 1, 0);
  rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
- rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
- rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
  rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
+ rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
@@ -2054,14 +2027,16 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_rm", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_rm), 3);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
- rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_add", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_add), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_pos_max", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_pos_max), 1);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
  rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
- rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
  rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
  rb_define_method(rb_cLLaMAContext, "sample_top_p", RUBY_METHOD_FUNC(_llama_context_sample_top_p), -1);
@@ -2070,7 +2045,6 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
  rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
  rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
- rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_greedy", RUBY_METHOD_FUNC(_llama_context_sample_token_greedy), 1);
@@ -2122,110 +2096,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("tokens"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<llama_token> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE token = rb_ary_entry(kw_values[0], i);
- if (!RB_INTEGER_TYPE_P(token)) {
- rb_raise(rb_eArgError, "tokens must be an array of integers");
- return Qnil;
- }
- embd[i] = NUM2INT(token);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
- static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval_embd is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<float> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE el = rb_ary_entry(kw_values[0], i);
- if (!RB_FLOAT_TYPE_P(el)) {
- rb_raise(rb_eArgError, "embd must be an array of floats");
- return Qnil;
- }
- embd[i] = NUM2DBL(el);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
  static VALUE _llama_context_decode(VALUE self, VALUE batch) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
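Note: the deprecated eval and eval_embd methods are removed outright, so decode with a batch is now the only evaluation path. A hypothetical sketch; the Batch.get_one helper and its keyword names are assumptions, not taken from this diff:

    # tokens is an Array of Integer token ids for the prompt
    batch = LLaMACpp::Batch.get_one(tokens: tokens, n_tokens: tokens.size, pos_zero: 0, seq_id: 0)
    context.decode(batch)   # replaces context.eval(tokens: tokens, n_past: 0)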
@@ -2282,7 +2152,7 @@ private:
  LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
  VALUE params = rb_iv_get(self, "@params");
  LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
- if (!prms_ptr->params.embedding) {
+ if (!prms_ptr->params.embeddings) {
  rb_raise(rb_eRuntimeError, "embedding parameter is false");
  return Qnil;
  }
@@ -2291,10 +2161,11 @@ private:
  return Qnil;
  }

+ const int n_tokens = NUM2INT(rb_iv_get(self, "@n_tokens"));
  const int n_embd = llama_n_embd(model_ptr->model);
  const float* embd = llama_get_embeddings(ptr->ctx);
  VALUE output = rb_ary_new();
- for (int i = 0; i < n_embd; i++) {
+ for (int i = 0; i < n_tokens * n_embd; i++) {
  rb_ary_push(output, DBL2NUM((double)(embd[i])));
  }

@@ -2313,7 +2184,7 @@ private:
  }
  VALUE params = rb_iv_get(self, "@params");
  LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
- if (!prms_ptr->params.embedding) {
+ if (!prms_ptr->params.embeddings) {
  rb_raise(rb_eRuntimeError, "embedding parameter is false");
  return Qnil;
  }
@@ -2331,6 +2202,36 @@ private:
  return output;
  }

+ static VALUE _llama_context_embeddings_seq(VALUE self, VALUE seq_id) {
+ if (!RB_INTEGER_TYPE_P(seq_id)) {
+ rb_raise(rb_eArgError, "seq_id must be an integer");
+ return Qnil;
+ }
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ VALUE params = rb_iv_get(self, "@params");
+ LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
+ if (!prms_ptr->params.embeddings) {
+ rb_raise(rb_eRuntimeError, "embedding parameter is false");
+ return Qnil;
+ }
+
+ VALUE model = rb_iv_get(self, "@model");
+ LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
+ const int n_embd = llama_n_embd(model_ptr->model);
+
+ VALUE output = rb_ary_new();
+ const float* embd = llama_get_embeddings_seq(ptr->ctx, NUM2INT(seq_id));
+ for (int i = 0; i < n_embd; i++) {
+ rb_ary_push(output, DBL2NUM((double)(embd[i])));
+ }
+
+ return output;
+ }
+
  static VALUE _llama_context_n_ctx(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
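Note: embeddings_seq is new and returns the pooled embedding for a single sequence id, while embeddings now returns n_tokens * n_embd values. A sketch, assuming LLaMACpp::Context.new(model:, params:) as in the gem's README:

    params = LLaMACpp::ContextParams.new
    params.embeddings = true
    params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_MEAN
    context = LLaMACpp::Context.new(model: model, params: params)
    # ... decode a batch containing sequence 0 ...
    vec = context.embeddings_seq(0)   # Array of n_embd Floats for sequence 0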
@@ -2430,13 +2331,13 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_kv_cache_seq_shift(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
+ static VALUE _llama_context_kv_cache_seq_add(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eArgError, "LLaMA context is not initialized");
  return Qnil;
  }
- llama_kv_cache_seq_shift(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
+ llama_kv_cache_seq_add(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
  return Qnil;
  }

@@ -2450,6 +2351,35 @@ private:
  return Qnil;
  }

+ static VALUE _llama_context_kv_cache_seq_pos_max(VALUE self, VALUE seq_id) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_kv_cache_seq_pos_max(ptr->ctx, NUM2INT(seq_id)));
+ }
+
+ static VALUE _llama_context_kv_cache_defrag(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_defrag(ptr->ctx);
+ return Qnil;
+ }
+
+ static VALUE _llama_context_kv_cache_update(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_update(ptr->ctx);
+ return Qnil;
+ }
+
  static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
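Note: kv_cache_seq_shift is renamed to kv_cache_seq_add, and kv_cache_seq_pos_max plus the defrag/update bindings are new. A sketch of the renamed call; the positions and delta are placeholders:

    # shift positions [32, 64) of sequence 0 forward by 4, as kv_cache_seq_shift did before
    context.kv_cache_seq_add(0, 32, 64, 4)
    context.kv_cache_seq_pos_max(0)   # => largest position stored for sequence 0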
@@ -2659,46 +2589,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
- VALUE kw_values[2] = { Qundef, Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
-
- if (!rb_obj_is_kind_of(kw_values[0], rb_cLLaMAContext)) {
- rb_raise(rb_eArgError, "guidance must be a Context");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
-
- LLaMAContextWrapper* guidance_ptr = get_llama_context(kw_values[0]);
- if (guidance_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "guidance context is not initialized");
- return Qnil;
- }
- const float scale = NUM2DBL(kw_values[1]);
-
- llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale);
-
- return Qnil;
- }
-
  static VALUE _llama_context_sample_softmax(VALUE self, VALUE candidates) {
  if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
  rb_raise(rb_eArgError, "argument must be a TokenDataArray");
@@ -2994,42 +2884,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("temperature") };
- VALUE kw_values[1] = { Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
-
- if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
- rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "temperature must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
- const float temperature = NUM2DBL(kw_values[0]);
-
- llama_sample_temperature(ctx_ptr->ctx, &(cnd_ptr->array), temperature);
-
- return Qnil;
- }
-
  static VALUE _llama_context_sample_token_mirostat(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[4] = { rb_intern("tau"), rb_intern("eta"), rb_intern("m"), rb_intern("mu") };
@@ -3307,16 +3161,6 @@ static VALUE rb_llama_time_us(VALUE self) {
  return LONG2NUM(llama_time_us());
  }

- static VALUE rb_llama_mmap_supported(VALUE self) {
- rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
- return llama_mmap_supported() ? Qtrue : Qfalse;
- }
-
- static VALUE rb_llama_mlock_supported(VALUE self) {
- rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
- return llama_mlock_supported() ? Qtrue : Qfalse;
- }
-
  static VALUE rb_llama_max_devices(VALUE self) {
  return SIZET2NUM(llama_max_devices());
  }
@@ -3355,8 +3199,6 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
  rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
- rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
- rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
@@ -3394,16 +3236,16 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
- rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_INT", INT2NUM(LLAMA_KV_OVERRIDE_INT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_FLOAT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_BOOL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
@@ -3413,19 +3255,20 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_RNG_UPPER", INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));

- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_UNSPECIFIED));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_NONE", INT2NUM(LLAMA_ROPE_SCALING_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_LINEAR));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_NONE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));

- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));

- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_ROW", INT2NUM(LLAMA_SPLIT_MODE_ROW));

  std::stringstream ss_magic;
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
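Note: the enum constants gain a _TYPE_ or _MODE_ segment to match the upstream llama.cpp renames. One renamed constant in use, assuming the LLaMACpp::ContextParams class as above:

    params.rope_scaling_type = LLaMACpp::LLAMA_ROPE_SCALING_TYPE_YARN   # was LLAMA_ROPE_SCALING_YARN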
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.12.7'
+ VERSION = '0.14.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2249'
+ LLAMA_CPP_VERSION = 'b2361'
  end