llama_cpp 0.14.6 → 0.15.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5c4bd6bcb93b98a00f94dcdf93d04f853174f73e281d96fce8f837a6ba7f250e
-  data.tar.gz: 6d184e9ce927c06ba794bea63a09007a175a72e477366ffb1c5763ceb2c7c71e
+  metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
+  data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
 SHA512:
-  metadata.gz: 953fe2777a759e5467694b8afb9d3f929a42603e81b2c3e38ba0fda4bb6dca78b2d147345023f99c2c9fb899cc746bf6729ad2726c2cb473d7094e93c13caf73
-  data.tar.gz: 71eb3cd5a5c619e9cc8a3418be745a8b76dc5e8cabe5b26a766230a8533df9a11c3981601b0be4ec0adb34a49f86ad741503ffc9f3b0d7ba021a7e9ddc3246a7
+  metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
+  data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
data/CHANGELOG.md CHANGED
@@ -1,3 +1,25 @@
+## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
+
+- Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
+- Bump llama.cpp from b2740 to b2781.
+- Change `LLAMA_SESSION_VERSION` value from 5 to 6.
+- Add constants for pre-tokenization types.
+- Add `flash_attn` accessor to `ContextParams`.
+- Add `check_tensors` accessor to `ModelParams`.
+- Add `LLAMA_KV_OVERRIDE_TYPE_STR` constant.
+
+**Breaking Change**
+- Change method names in `ModelKVOverride`.
+
+## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
+
+- Bump llama.cpp from b2698 to b2740.
+- Add `keep_split` accessor to `ModelQuantizeParams`.
+- Add `pooling_type` method to `Context`.
+- Add `token_is_eog?` method to `Model`.
+
+The binding for `llama_sample_token_with_rng` has been skipped.
+
 ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
 
 - Bump llama.cpp from b2658 to b2698.
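
The 0.15.0 entries above translate into roughly the following Ruby sketch (the GGUF path is hypothetical; the constructor keywords follow the gem's README, and `flash_attn`/`check_tensors` are plain boolean accessors):

```ruby
require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model_params.check_tensors = true # validate tensor data while the model loads

context_params = LLaMACpp::ContextParams.new
context_params.flash_attn = true # request flash attention for this context

model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)
context = LLaMACpp::Context.new(model: model, params: context_params)
puts context.pooling_type # Integer, the context's llama_pooling_type value
```
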
data/README.md CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use cuBLAS
-$ gem install llama_cpp -- --with-cublas
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by the with_config method.
data/ext/llama_cpp/extconf.rb CHANGED
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
 make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
 make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
 make_envs << ' LLAMA_BLIS=1' if with_config('blis')
-make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
+make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
+make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
 make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -708,9 +708,10 @@ public:
     rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
     rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
     rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
-    rb_define_method(rb_cLLaMAModelKVOverride, "int_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_int_value), 0);
-    rb_define_method(rb_cLLaMAModelKVOverride, "float_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_float_value), 0);
-    rb_define_method(rb_cLLaMAModelKVOverride, "bool_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_bool_value), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
   }
 
   static const rb_data_type_t llama_model_kv_override_type;
@@ -726,19 +727,24 @@ private:
     return INT2NUM(ptr->tag);
   }
 
-  static VALUE _llama_model_kv_override_get_int_value(VALUE self) {
+  static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return INT2NUM(ptr->int_value);
+    return INT2NUM(ptr->val_i64);
   }
 
-  static VALUE _llama_model_kv_override_get_float_value(VALUE self) {
+  static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return DBL2NUM(ptr->float_value);
+    return DBL2NUM(ptr->val_f64);
   }
 
-  static VALUE _llama_model_kv_override_get_bool_value(VALUE self) {
+  static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return ptr->bool_value ? Qtrue : Qfalse;
+    return ptr->val_bool ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
+    llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+    return rb_utf8_str_new_cstr(ptr->val_str);
   }
 };
 
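In Ruby, the renamed readers dispatch naturally on `tag` using the `LLAMA_KV_OVERRIDE_TYPE_*` constants (the `STR` variant is new in this release). A hedged sketch, assuming `override` is an existing `LLaMACpp::ModelKVOverride`:

```ruby
# Pick the reader that matches the override's tag.
value =
  case override.tag
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_INT   then override.val_i64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_FLOAT then override.val_f64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_BOOL  then override.val_bool
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_STR   then override.val_str
  end
puts "#{override.key} => #{value}"
```
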
@@ -800,6 +806,8 @@ public:
     rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
     rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
     rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
+    rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
+    rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
   }
 
 private:
@@ -892,6 +900,18 @@ private:
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
     return ptr->params.use_mlock ? Qtrue : Qfalse;
   }
+
+  // check_tensors
+  static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
+    return ptr->params.check_tensors ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_params_get_check_tensors(VALUE self) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    return ptr->params.check_tensors ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
@@ -984,6 +1004,8 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+    rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
+    rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
   }
 
 private:
@@ -1262,6 +1284,18 @@ private:
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return ptr->params.offload_kqv ? Qtrue : Qfalse;
   }
+
+  // flash_attn
+  static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
+    return ptr->params.flash_attn ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_context_params_get_flash_attn(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.flash_attn ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -1321,6 +1355,8 @@ public:
     rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
   }
 
 private:
@@ -1405,6 +1441,18 @@ private:
     LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
     return ptr->params.pure ? Qtrue : Qfalse;
   }
+
+  // keep_split
+  static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.keep_split = RTEST(keep_split) ? true : false;
+    return ptr->params.keep_split ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.keep_split ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
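
A hedged sketch of `keep_split` from Ruby; the `LLaMACpp.model_quantize` keyword signature is assumed from the gem's README, and the file names are hypothetical (`keep_split` mirrors llama.cpp's option to quantize sharded GGUF files shard-by-shard):

```ruby
params = LLaMACpp::ModelQuantizeParams.new
params.keep_split = true # keep the input's split/shard layout in the output
LLaMACpp.model_quantize(input_path: 'in-00001-of-00002.gguf',
                        output_path: 'out.gguf', params: params)
```
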
@@ -1487,6 +1535,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
     rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
     rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
+    rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
   }
 
 private:
@@ -1634,10 +1683,10 @@ private:
     const llama_token token = NUM2INT(token_);
     LLaMAModelWrapper* ptr = get_llama_model(self);
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
     if (n_tokens < 0) {
       result.resize(-n_tokens);
-      const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+      const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
       if (check != -n_tokens) {
         rb_raise(rb_eRuntimeError, "failed to convert");
         return Qnil;
@@ -1789,6 +1838,16 @@ private:
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_token_eot(ptr->model));
   }
+
+  static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
+    if (!RB_INTEGER_TYPE_P(token_)) {
+      rb_raise(rb_eArgError, "token must be an integer");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
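
A hedged sketch of `token_is_eog?` in a sampling loop; `sample_next_token` is a hypothetical helper standing in for whatever sampling strategy you use. Unlike a bare `token == model.token_eos` comparison, `token_is_eog?` also covers EOT-style end-of-generation tokens:

```ruby
loop do
  token = sample_next_token(context) # hypothetical sampling helper
  break if model.token_is_eog?(token)
  print model.token_to_piece(token)
end
```
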
@@ -2102,6 +2161,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
     rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
+    rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
   }
 
 private:
@@ -3225,6 +3285,15 @@ private:
 
     return Qnil;
   }
+
+  static VALUE _llama_context_pooling_type(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
+  }
 };
 
 const rb_data_type_t RbLLaMAContext::llama_context_type = {
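
A hedged sketch of `pooling_type`; it returns the context's raw `llama_pooling_type` enum value as an Integer, and the `LLAMA_POOLING_TYPE_*` constants are assumed to be defined by your gem version:

```ruby
case context.pooling_type
when LLaMACpp::LLAMA_POOLING_TYPE_MEAN then puts 'mean pooling (typical for embedding models)'
when LLaMACpp::LLAMA_POOLING_TYPE_CLS  then puts 'CLS-token pooling'
else puts 'no pooling'
end
```
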
@@ -3351,6 +3420,15 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
@@ -3393,6 +3471,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.6'
+  VERSION = '0.15.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2698'
+  LLAMA_CPP_VERSION = 'b2781'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -16,6 +16,15 @@ module LLaMACpp
   LLAMA_VOCAB_TYPE_BPE: Integer
   LLAMA_VOCAB_TYPE_WPM: Integer
 
+  LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
+  LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
+  LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
+  LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
+  LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
+  LLAMA_VOCAB_PRE_TYPE_MPT: Integer
+  LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
+  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -48,6 +57,7 @@ module LLaMACpp
   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
   LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
+  LLAMA_KV_OVERRIDE_TYPE_STR: Integer
 
   LLAMA_GRETYPE_END: Integer
   LLAMA_GRETYPE_ALT: Integer
@@ -141,6 +151,7 @@ module LLaMACpp
     def token_middle: () -> Integer
     def token_suffix: () -> Integer
     def token_eot: () -> Integer
+    def token_is_eog?: (Integer) -> bool
   end
 
   class Timings
@@ -162,9 +173,10 @@ module LLaMACpp
 
     def key: () -> String
     def tag: () -> Integer
-    def int_value: () -> Integer
-    def float_value: () -> Float
-    def bool_value: () -> bool
+    def val_i64: () -> Integer
+    def val_f64: () -> Float
+    def val_bool: () -> bool
+    def val_str: () -> String
   end
 
   class ModelParams
@@ -183,6 +195,8 @@ module LLaMACpp
     def use_mmap=: (bool) -> bool
     def use_mlock: () -> bool
     def use_mlock=: (bool) -> bool
+    def check_tensors: () -> bool
+    def check_tensors=: (bool) -> bool
   end
 
   class Batch
@@ -260,6 +274,7 @@ module LLaMACpp
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
     def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
+    def pooling_type: () -> Integer
   end
 
   class ContextParams
@@ -309,6 +324,8 @@ module LLaMACpp
     def embeddings=: (bool) -> bool
     def offload_kqv: () -> bool
     def offload_kqv=: (bool) -> bool
+    def flash_attn: () -> bool
+    def flash_attn=: (bool) -> bool
   end
 
   class ModelQuantizeParams
@@ -328,6 +345,8 @@ module LLaMACpp
     def only_copy=: (bool) -> bool
     def pure: () -> bool
     def pure=: (bool) -> bool
+    def keep_split: () -> bool
+    def keep_split=: (bool) -> bool
   end
 
   class Params = ContextParams
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -6,11 +6,23 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
-	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-	tests/test-json-schema-to-grammar tests/test-grammar-integration
+	tests/test-autorelease \
+	tests/test-backend-ops \
+	tests/test-double-float \
+	tests/test-grad0 \
+	tests/test-grammar-integration \
+	tests/test-grammar-parser \
+	tests/test-json-schema-to-grammar \
+	tests/test-llama-grammar \
+	tests/test-model-load-cancel \
+	tests/test-opt \
+	tests/test-quantize-fns \
+	tests/test-quantize-perf \
+	tests/test-rope \
+	tests/test-sampling \
+	tests/test-tokenizer-0 \
+	tests/test-tokenizer-1-bpe \
+	tests/test-tokenizer-1-spm
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC  := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC  := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
 	@failures=0; \
 	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
@@ -386,10 +415,6 @@ ifdef LLAMA_OPENBLAS
 MK_LDFLAGS  += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
-# TODO: temporary disable until MoE is fixed
-# https://github.com/ggerganov/llama.cpp/pull/6716
-LLAMA_NO_LLAMAFILE := 1
-
 ifndef LLAMA_NO_LLAMAFILE
 MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
 OBJS += sgemm.o
@@ -701,7 +726,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
 COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 
 common.o: common/common.cpp $(COMMON_H_DEPS)
@@ -777,7 +802,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -805,10 +830,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+		echo "unsigned char $${NAME}[] = {" && \
+		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+		echo "};" && \
+		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+	) > $@
+
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -971,11 +1005,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -983,7 +1013,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED
@@ -371,16 +371,16 @@ struct ggml_gallocr {
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
-    galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);
 
-    galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
     GGML_ASSERT(galloc->buf_tallocs != NULL);
 
     for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         free(galloc->hash_set.keys);
         free(galloc->hash_values);
         galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
-        galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+        galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
         GGML_ASSERT(galloc->hash_set.keys != NULL);
         GGML_ASSERT(galloc->hash_values != NULL);
     } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
         free(galloc->node_allocs);
-        galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
         GGML_ASSERT(galloc->node_allocs != NULL);
     }
     galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
+        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
data/vendor/tmp/llama.cpp/ggml-backend.c CHANGED
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
+            return
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS &&
+                op->type != GGML_TYPE_IQ1_S &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
-    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
-    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+    sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
 
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
+    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
 
     sched->n_backends = n_backends;
 
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
     const int initial_splits_capacity = 16;
-    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {
@@ -1780,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
-    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+    if (!sched->is_reset) {
+        size_t hash_size = sched->hash_set.size;
+        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+        sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }
 
@@ -1968,10 +1974,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = {
         /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
+        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
     };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
-    bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
+    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
 
     struct ggml_init_params params = {
         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),