llama_cpp 0.12.7 → 0.13.0

This diff shows the changes between publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
- data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
+ metadata.gz: 8e8d23f3abceeea388895f198a3906b7a24d692cba97e46934a14567450fc3a2
+ data.tar.gz: 9d1385671b76ea826fbc000910e102fbbb951970f77b7511fdf2653adbc97334
  SHA512:
- metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
- data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
+ metadata.gz: 24746b8aaaa749b4058ddb64f6b07952356a6947ef1f40bc8bf7010a37b8b476e71632452ce28b6e61b11c66249a9d4fb6573de31e66e750bdb4391ce8f3286c
+ data.tar.gz: 56f79812ecdeecfc2dce6f68a73fc72d4495c6a51cc1d2ea7ccfeeb3e1ac9b6e72e78cbed019108e05987e431c4634bbfa1029f380f813a7fb6e009b5f6ec4e3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
+ ## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
+
+ - Bump bundled llama.cpp from b2143 to b2303.
+ - Remove deprecated methods:
+   - `map_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
+ - Rename some constants.
+ - Rename `kv_cache_seq_shift` method to `kv_cache_seq_add`.
+ - Add `defrag_thold` accessor to `ContextParams`.
+ - Add `vocab_type` and `rope_type` methods to `Model`.
+ - Add `kv_cache_seq_pos_max`, `kv_cache_defrag`, and `kv_cache_update` methods to `Context`.
+
  ## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24

  - Bump bundled llama.cpp from b2106 to b2143.
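Taken together, the 0.13.0 entries above amount to a small migration for Ruby callers. A minimal, hedged sketch of post-upgrade usage, assuming a placeholder GGUF model path and that `Model.new` and `ModelParams.new` take keyword arguments in the same style as `Model#load` in the RBS signatures below; only the method and constant names come from this diff:

```ruby
require 'llama_cpp'

# The deprecated module-level checks are gone; the supports_* variants remain.
puts LLaMACpp.supports_mmap?
puts LLaMACpp.supports_mlock?

model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)

context_params = LLaMACpp::ContextParams.new
context_params.defrag_thold = 0.1 # new accessor: KV cache defragmentation threshold
context = LLaMACpp::Context.new(model: model, params: context_params)

# eval/eval_embd are removed; Context#decode with a Batch is the remaining path.
# kv_cache_seq_shift is renamed to kv_cache_seq_add (seq_id, p0, p1, delta).
context.kv_cache_seq_add(0, 0, -1, 4)
```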
@@ -966,12 +966,12 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_slow), 0);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_orig_ctx), 1);
  rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_orig_ctx), 0);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold=", RUBY_METHOD_FUNC(_llama_context_params_set_defrag_thold), 1);
+ rb_define_method(rb_cLLaMAContextParams, "defrag_thold", RUBY_METHOD_FUNC(_llama_context_params_get_defrag_thold), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_k=", RUBY_METHOD_FUNC(_llama_context_params_set_type_k), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_k", RUBY_METHOD_FUNC(_llama_context_params_get_type_k), 0);
  rb_define_method(rb_cLLaMAContextParams, "type_v=", RUBY_METHOD_FUNC(_llama_context_params_set_type_v), 1);
  rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q=", RUBY_METHOD_FUNC(_llama_context_params_set_mul_mat_q), 1);
- rb_define_method(rb_cLLaMAContextParams, "mul_mat_q", RUBY_METHOD_FUNC(_llama_context_params_get_mul_mat_q), 0);
  rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
  rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
  rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
@@ -1146,6 +1146,18 @@ private:
  return UINT2NUM(ptr->params.yarn_orig_ctx);
  }

+ // defrag_thold
+ static VALUE _llama_context_params_set_defrag_thold(VALUE self, VALUE defrag_thold) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.defrag_thold = NUM2DBL(defrag_thold);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
+ static VALUE _llama_context_params_get_defrag_thold(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return DBL2NUM(ptr->params.defrag_thold);
+ }
+
  static VALUE _llama_context_params_get_yarn_orig_ctx(VALUE self) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  return UINT2NUM(ptr->params.yarn_orig_ctx);
@@ -1175,18 +1187,6 @@ private:
  return INT2NUM(ptr->params.type_v);
  }

- // mul_mat_q
- static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- ptr->params.mul_mat_q = RTEST(mul_mat_q) ? true : false;
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
- static VALUE _llama_context_params_get_mul_mat_q(VALUE self) {
- LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
- return ptr->params.mul_mat_q ? Qtrue : Qfalse;
- }
-
  // logits_all
  static VALUE _llama_context_params_set_logits_all(VALUE self, VALUE logits_all) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -1433,7 +1433,8 @@ public:
  rb_define_method(rb_cLLaMAModel, "empty?", RUBY_METHOD_FUNC(_llama_model_empty), 0);
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
  rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
- rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
+ rb_define_method(rb_cLLaMAModel, "vocab_type", RUBY_METHOD_FUNC(_llama_model_get_model_vocab_type), 0);
+ rb_define_method(rb_cLLaMAModel, "rope_type", RUBY_METHOD_FUNC(_llama_model_get_model_rope_type), 0);
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
@@ -1559,41 +1560,14 @@ private:
  return Qnil;
  }

- static VALUE _llama_model_apply_lora_from_file(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[4] = { rb_intern("lora_path"), rb_intern("base_model_path"), rb_intern("n_threads"), rb_intern("scale") };
- VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);
-
- if (!RB_TYPE_P(kw_values[0], T_STRING)) {
- rb_raise(rb_eArgError, "lora_path must be a string");
- return Qnil;
- }
- if (kw_values[1] != Qundef && !RB_TYPE_P(kw_values[1], T_STRING)) {
- rb_raise(rb_eArgError, "base_model_path must be a string");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_threads must be an integer");
- return Qnil;
- }
- if (kw_values[3] != Qundef && !RB_FLOAT_TYPE_P(kw_values[3])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- const char* lora_path = StringValueCStr(kw_values[0]);
- const char* base_model_path = kw_values[1] == Qundef ? NULL : StringValueCStr(kw_values[1]);
- const int n_threads = kw_values[2] == Qundef ? 1 : NUM2INT(kw_values[2]);
- const float scale = kw_values[3] == Qundef ? 1.0 : NUM2DBL(kw_values[3]);
+ static VALUE _llama_model_get_model_vocab_type(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_vocab_type(ptr->model));
+ }

+ static VALUE _llama_model_get_model_rope_type(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- if (llama_model_apply_lora_from_file(ptr->model, lora_path, scale, base_model_path, n_threads) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to apply LoRA");
- return Qnil;
- }
- return Qnil;
+ return INT2NUM(llama_rope_type(ptr->model));
  }

  static VALUE _llama_model_get_model_n_vocab(VALUE self) {
@@ -2038,8 +2012,6 @@ public:
  rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
  rb_define_attr(rb_cLLaMAContext, "model", 1, 0);
  rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
- rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
- rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
  rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -2054,14 +2026,16 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_rm", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_rm), 3);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
- rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_add", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_add), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_pos_max", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_pos_max), 1);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
  rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
- rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
  rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
  rb_define_method(rb_cLLaMAContext, "sample_top_p", RUBY_METHOD_FUNC(_llama_context_sample_top_p), -1);
@@ -2070,7 +2044,6 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
  rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
  rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
- rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_greedy", RUBY_METHOD_FUNC(_llama_context_sample_token_greedy), 1);
@@ -2122,110 +2095,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_eval(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("tokens"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<llama_token> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE token = rb_ary_entry(kw_values[0], i);
- if (!RB_INTEGER_TYPE_P(token)) {
- rb_raise(rb_eArgError, "tokens must be an array of integers");
- return Qnil;
- }
- embd[i] = NUM2INT(token);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
- static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
-
- rb_warn("eval_embd is deprecated. Use decode instead.");
-
- if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
- rb_raise(rb_eArgError, "tokens must be an Array");
- return Qnil;
- }
- if (!RB_INTEGER_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "n_past must be an integer");
- return Qnil;
- }
- if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
- rb_raise(rb_eArgError, "n_tokens must be an integer");
- return Qnil;
- }
-
- const size_t tokens_len = RARRAY_LEN(kw_values[0]);
- std::vector<float> embd(tokens_len);
- for (size_t i = 0; i < tokens_len; i++) {
- VALUE el = rb_ary_entry(kw_values[0], i);
- if (!RB_FLOAT_TYPE_P(el)) {
- rb_raise(rb_eArgError, "embd must be an array of floats");
- return Qnil;
- }
- embd[i] = NUM2DBL(el);
- }
-
- const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
- const int n_past = NUM2INT(kw_values[1]);
-
- LLaMAContextWrapper* ptr = get_llama_context(self);
- if (ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past) != 0) {
- rb_raise(rb_eRuntimeError, "Failed to evaluate");
- return Qnil;
- }
-
- rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
- rb_iv_set(self, "@has_evaluated", Qtrue);
-
- return Qnil;
- }
-
  static VALUE _llama_context_decode(VALUE self, VALUE batch) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2430,13 +2299,13 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_kv_cache_seq_shift(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
+ static VALUE _llama_context_kv_cache_seq_add(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE delta) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eArgError, "LLaMA context is not initialized");
  return Qnil;
  }
- llama_kv_cache_seq_shift(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
+ llama_kv_cache_seq_add(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(delta));
  return Qnil;
  }

@@ -2450,6 +2319,35 @@ private:
  return Qnil;
  }

+ static VALUE _llama_context_kv_cache_seq_pos_max(VALUE self, VALUE seq_id) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_kv_cache_seq_pos_max(ptr->ctx, NUM2INT(seq_id)));
+ }
+
+ static VALUE _llama_context_kv_cache_defrag(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_defrag(ptr->ctx);
+ return Qnil;
+ }
+
+ static VALUE _llama_context_kv_cache_update(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_update(ptr->ctx);
+ return Qnil;
+ }
+
  static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2659,46 +2557,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
- VALUE kw_values[2] = { Qundef, Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
-
- if (!rb_obj_is_kind_of(kw_values[0], rb_cLLaMAContext)) {
- rb_raise(rb_eArgError, "guidance must be a Context");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[1])) {
- rb_raise(rb_eArgError, "scale must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
-
- LLaMAContextWrapper* guidance_ptr = get_llama_context(kw_values[0]);
- if (guidance_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "guidance context is not initialized");
- return Qnil;
- }
- const float scale = NUM2DBL(kw_values[1]);
-
- llama_sample_classifier_free_guidance(ctx_ptr->ctx, &(cnd_ptr->array), guidance_ptr->ctx, scale);
-
- return Qnil;
- }
-
  static VALUE _llama_context_sample_softmax(VALUE self, VALUE candidates) {
  if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
  rb_raise(rb_eArgError, "argument must be a TokenDataArray");
@@ -2994,42 +2852,6 @@ private:
  return Qnil;
  }

- static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("temperature") };
- VALUE kw_values[1] = { Qundef };
- VALUE candidates = Qnil;
- rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
-
- if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
- rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
- return Qnil;
- }
- if (!RB_FLOAT_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "temperature must be a float");
- return Qnil;
- }
-
- LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
- if (ctx_ptr->ctx == NULL) {
- rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
- return Qnil;
- }
- LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
- if (cnd_ptr->array.data == nullptr) {
- rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
- return Qnil;
- }
- const float temperature = NUM2DBL(kw_values[0]);
-
- llama_sample_temperature(ctx_ptr->ctx, &(cnd_ptr->array), temperature);
-
- return Qnil;
- }
-
  static VALUE _llama_context_sample_token_mirostat(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[4] = { rb_intern("tau"), rb_intern("eta"), rb_intern("m"), rb_intern("mu") };
@@ -3307,16 +3129,6 @@ static VALUE rb_llama_time_us(VALUE self) {
  return LONG2NUM(llama_time_us());
  }

- static VALUE rb_llama_mmap_supported(VALUE self) {
- rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
- return llama_mmap_supported() ? Qtrue : Qfalse;
- }
-
- static VALUE rb_llama_mlock_supported(VALUE self) {
- rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
- return llama_mlock_supported() ? Qtrue : Qfalse;
- }
-
  static VALUE rb_llama_max_devices(VALUE self) {
  return SIZET2NUM(llama_max_devices());
  }
@@ -3355,8 +3167,6 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
  rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
- rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
- rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
@@ -3394,16 +3204,16 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
- rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_INT", INT2NUM(LLAMA_KV_OVERRIDE_INT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_FLOAT));
- rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_BOOL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
@@ -3413,19 +3223,19 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_RNG_UPPER", INT2NUM(LLAMA_GRETYPE_CHAR_RNG_UPPER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));

- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_UNSPECIFIED));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_NONE", INT2NUM(LLAMA_ROPE_SCALING_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_LINEAR));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
- rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_NONE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_LINEAR", INT2NUM(LLAMA_ROPE_SCALING_TYPE_LINEAR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));

- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
- rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));

- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
- rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_ROW", INT2NUM(LLAMA_SPLIT_MODE_ROW));

  std::stringstream ss_magic;
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.12.7'
+ VERSION = '0.13.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2249'
+ LLAMA_CPP_VERSION = 'b2303'
  end
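The two constants above give a quick runtime check of which gem release and bundled llama.cpp build are actually loaded; a trivial sketch:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.13.0"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2303"
```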
data/sig/llama_cpp.rbs CHANGED
@@ -27,14 +27,14 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
- LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer

- LLAMA_KV_OVERRIDE_INT: Integer
- LLAMA_KV_OVERRIDE_FLOAT: Integer
- LLAMA_KV_OVERRIDE_BOOL: Integer
+ LLAMA_KV_OVERRIDE_TYPE_INT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer

  LLAMA_GRETYPE_END: Integer
  LLAMA_GRETYPE_ALT: Integer
@@ -44,19 +44,19 @@ module LLaMACpp
  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
  LLAMA_GRETYPE_CHAR_ALT: Integer

- LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
- LLAMA_ROPE_SCALING_NONE: Integer
- LLAMA_ROPE_SCALING_LINEAR: Integer
- LLAMA_ROPE_SCALING_YARN: Integer
- LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
+ LLAMA_ROPE_SCALING_TYPE_NONE: Integer
+ LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
+ LLAMA_ROPE_SCALING_TYPE_YARN: Integer
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer

- LLAMA_POOLING_NONE: Integer
- LLAMA_POOLING_MEAN: Integer
- LLAMA_POOLING_CLS: Integer
+ LLAMA_POOLING_TYPE_NONE: Integer
+ LLAMA_POOLING_TYPE_MEAN: Integer
+ LLAMA_POOLING_TYPE_CLS: Integer

- LLAMA_SPLIT_NONE: Integer
- LLAMA_SPLIT_LAYER: Integer
- LLAMA_SPLIT_ROW: Integer
+ LLAMA_SPLIT_MODE_NONE: Integer
+ LLAMA_SPLIT_MODE_LAYER: Integer
+ LLAMA_SPLIT_MODE_ROW: Integer

  def self?.backend_init: () -> void
  def self?.backend_free: () -> void
@@ -68,8 +68,6 @@ module LLaMACpp
  ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
  def self?.print_system_info: () -> void
  def self?.time_us: () -> Integer
- def self?.mmap_supported?: () -> bool
- def self?.mlock_supported?: () -> bool
  def self?.max_devices: () -> Integer
  def self?.supports_mmap?: () -> bool
  def self?.supports_mlock?: () -> bool
@@ -103,7 +101,8 @@ module LLaMACpp
  def empty?: () -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
- def apply_lora_from_file: (lora_path: String, ?scale: Float, ?base_model_path: String, ?n_threads: Integer) -> void
+ def vocab_type: () -> Integer
+ def rope_type: () -> Integer
  def n_vocab: () -> Integer
  def n_ctx_train: () -> Integer
  def n_embd: () -> Integer
@@ -202,8 +201,6 @@ module LLaMACpp
  def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
  def embeddings: () -> Array[Float]
  def embeddings_ith: (Integer) -> Array[Float]
- def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
- def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
@@ -216,14 +213,16 @@ module LLaMACpp
  def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
  def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
  def kv_cache_seq_keep: (Integer) -> void
- def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
  def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_pos_max: (Integer) -> Integer
+ def kv_cache_defrag: () -> void
+ def kv_cache_update: () -> void
  def set_rng_seed: (Integer) -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
  def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -232,7 +231,6 @@ module LLaMACpp
  def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
  def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
  def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
- def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
  def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
@@ -270,12 +268,12 @@ module LLaMACpp
  def yarn_beta_slow: () -> Float
  def yarn_orig_ctx=: (Integer) -> Integer
  def yarn_orig_ctx: () -> Integer
+ def defrag_thold=: (Float) -> Float
+ def defrag_thold: () -> Float
  def type_k=: (Integer) -> Integer
  def type_k: () -> Integer
  def type_v=: (Integer) -> Integer
  def type_v: () -> Integer
- def mul_mat_q: () -> bool
- def mul_mat_q=: (bool) -> bool
  def logits_all: () -> bool
  def logits_all=: (bool) -> bool
  def embedding: () -> bool
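The RBS changes above declare the new `ContextParams#defrag_thold` accessor and, a little earlier, the added `Context` KV-cache methods. A short hedged sketch of how they might be driven, reusing the `model` and `context` objects from the earlier sketch; note that the RBS and changelog name the methods `kv_cache_defrag` and `kv_cache_update`, while the C++ hunk registers them as "kv_cache_kv_cache_defrag" and "kv_cache_kv_cache_update", so the names actually callable on 0.13.0 may not match these signatures:

```ruby
# Newly exposed Model introspection: integer enum values returned by
# llama_vocab_type/llama_rope_type in the bundled llama.cpp.
puts model.vocab_type
puts model.rope_type

# Highest position stored in the KV cache for sequence 0.
puts context.kv_cache_seq_pos_max(0)

# Trigger defragmentation and apply any pending KV cache updates
# (names as declared in the RBS; see the registration mismatch noted above).
context.kv_cache_defrag
context.kv_cache_update
```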
@@ -383,8 +383,13 @@ ifdef LLAMA_BLIS
  endif # LLAMA_BLIS

  ifdef LLAMA_CUBLAS
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
- MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
+ ifneq ('', '$(wildcard /opt/cuda)')
+ CUDA_PATH ?= /opt/cuda
+ else
+ CUDA_PATH ?= /usr/local/cuda
+ endif
+ MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  MK_NVCCFLAGS += -use_fast_math
  ifdef LLAMA_FATAL_WARNINGS
@@ -599,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
  ifdef LLAMA_CUBLAS
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
- CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
  ifndef CUDA_DOCKER_ARCH
  ifndef CUDA_POWER_ARCH