llama_cpp 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
    
        checksums.yaml
    CHANGED
    
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: bf8532fb7d2d96acd42b0da600cd5a8923411545817cb433036c182cb6d549ca
+  data.tar.gz: 2aa68d4ffe814632d6b8f2d97f6407284520d2b540b8920c40d486c599221bc3
 SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: ea8dad06ae15f9ca6ba585ae901d163bc6580543131338bbe785444083791b6251b2f631725190f9935740d0169520e3da604a66330e5bf7551031b7dc47dd81
+  data.tar.gz: 3b32180e6a4653af2afac59d4640c5d06b29f5872d7ef40f33dcddc27b97e2eb0fa2f38fe6389ddcb60b4631db9e3ebfd0d4b14cc9de6419e50452b4c67ad98a
    
        data/CHANGELOG.md
    CHANGED
    
@@ -1,3 +1,32 @@
+## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
+
+- Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
+- Fix installation files for CUDA.
+- Add metal config option:
+  ```
+  $ gem install llama_cpp -- --with-metal
+  ```
+  ```ruby
+  require 'llama_cpp'
+
+  params = LLaMACpp::ContextParams.new
+  params.n_gpu_layers = 1
+
+  context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
+  LLaMACpp.generate(context, 'Hello, world.')
+  ```
+
+**Breaking Changes**
+
+- Add ModelQuantizationParams class.
+- Change the argument of the `model_quantize` module function in LLaMACpp.
+  ```ruby
+  require 'llama_cpp'
+
+  params = LLaMACpp::ModelQuantizeParams.new
+  LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
+  ```
+
 ## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
 
 - Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
    
        data/ext/llama_cpp/extconf.rb
    CHANGED
    
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 require 'mkmf'
+require 'fileutils'
 
 abort 'libstdc++ is not found.' unless have_library('stdc++')
 
@@ -36,17 +37,30 @@ if with_config('accelerate')
   $CFLAGS << ' -DGGML_USE_ACCELERATE'
 end
 
+if with_config('metal')
+  $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+  $CXXFLAGS << ' -DGGML_USE_METAL'
+  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders'
+  $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
+end
+
 if with_config('cublas')
   $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
   $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
   $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
 end
 
 if with_config('clblast')
   abort 'libclblast is not found.' unless have_library('clblast')
-  abort 'libOpenCL is not found.' unless have_library('OpenCL')
 
   $CFLAGS << ' -DGGML_USE_CLBLAST'
+  $CXXFLAGS << ' -DGGML_USE_CLBLAST'
+  if RUBY_PLATFORM.match?(/darwin/)
+    $LDFLAGS << ' -framework OpenCL'
+  else
+    abort 'libOpenCL is not found.' unless have_library('OpenCL')
+  end
 end
 
 UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -78,3 +92,14 @@ if with_config('cublas')
     f.puts "\tnvcc -arch=native -c -o $@ $<"
   end
 end
+
+if with_config('metal')
+  File.open('Makefile', 'a') do |f|
+    f.puts 'ggml-metal.o: ggml-metal.m ggml-metal.h'
+    f.puts "\t$(CC) $(CFLAGS) -c $< -o $@"
+  end
+
+  metal_path = File.expand_path("#{__dir__}/src/ggml-metal.metal")
+  dest_path = File.expand_path("#{__dir__}/../../lib/llama_cpp/")
+  FileUtils.cp(metal_path, dest_path)
+end
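
The extconf.rb changes above add a Metal build option (defining `GGML_USE_METAL`, linking the Apple Metal frameworks, appending `ggml-metal.o` to the object list, and copying `ggml-metal.metal` into `lib/llama_cpp/`), propagate the CUDA and CLBlast defines to `$CXXFLAGS`, and link the OpenCL framework instead of `libOpenCL` on macOS. A rough sketch of how these `with_config` switches are passed at install time; only `--with-metal` appears in the CHANGELOG, the other flag names are inferred from the `with_config` keys above:

```
$ gem install llama_cpp -- --with-metal
$ gem install llama_cpp -- --with-cublas    # expects the CUDA toolkit under /usr/local/cuda
$ gem install llama_cpp -- --with-clblast   # needs libclblast (and libOpenCL outside macOS)
```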
    
        data/ext/llama_cpp/llama_cpp.cpp
    CHANGED
    
@@ -4,6 +4,7 @@
 VALUE rb_mLLaMACpp;
 VALUE rb_cLLaMAContext;
 VALUE rb_cLLaMAContextParams;
+VALUE rb_cLLaMAModelQuantizeParams;
 VALUE rb_cLLaMATokenData;
 VALUE rb_cLLaMATokenDataArray;
 
@@ -292,6 +293,13 @@ public:
     // rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_context_params_set_n_gpu_layers), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_context_params_get_n_gpu_layers), 0);
+    rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
+    rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
+    rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
     rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
     rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
     rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -329,6 +337,55 @@ private:
     return INT2NUM(ptr->params.n_ctx);
   };
 
+  // n_batch
+  static VALUE _llama_context_params_set_n_batch(VALUE self, VALUE n_batch) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_batch = NUM2INT(n_batch);
+    return INT2NUM(ptr->params.n_batch);
+  };
+
+  static VALUE _llama_context_params_get_n_batch(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_batch);
+  };
+
+  // n_gpu_layers
+  static VALUE _llama_context_params_set_n_gpu_layers(VALUE self, VALUE n_gpu_layers) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_gpu_layers = NUM2INT(n_gpu_layers);
+    return INT2NUM(ptr->params.n_gpu_layers);
+  };
+
+  static VALUE _llama_context_params_get_n_gpu_layers(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_gpu_layers);
+  };
+
+  // main_gpu
+  static VALUE _llama_context_params_set_main_gpu(VALUE self, VALUE main_gpu) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.main_gpu = NUM2INT(main_gpu);
+    return INT2NUM(ptr->params.main_gpu);
+  };
+
+  static VALUE _llama_context_params_get_main_gpu(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.main_gpu);
+  };
+
+  // tensor_split
+  static VALUE _llama_context_params_get_tensor_split(VALUE self) {
+    if (LLAMA_MAX_DEVICES < 1) {
+      return rb_ary_new();
+    }
+    VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
+      rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
+    }
+    return ret;
+  };
+
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
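
The accessors above expose the new `llama_context_params` fields (`n_batch`, `n_gpu_layers`, `main_gpu`, and the read-only `tensor_split` array) to Ruby. A minimal usage sketch, with a placeholder model path; the attribute names are exactly the ones registered with `rb_define_method` in the previous hunk:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_batch = 512       # batch size used for prompt evaluation
params.n_gpu_layers = 32   # layers to offload when a GPU backend is compiled in
params.main_gpu = 0        # device index used as the main GPU
p params.tensor_split      # read-only Array of LLAMA_MAX_DEVICES floats

context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
```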
@@ -424,6 +481,121 @@ const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
   RUBY_TYPED_FREE_IMMEDIATELY
 };
 
+class LLaMAModelQuantizeParamsWrapper {
+public:
+  llama_model_quantize_params params;
+
+  LLaMAModelQuantizeParamsWrapper() : params(llama_model_quantize_default_params()){};
+
+  ~LLaMAModelQuantizeParamsWrapper(){};
+};
+
+class RbLLaMAModelQuantizeParams {
+public:
+  static VALUE llama_model_quantize_params_alloc(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = (LLaMAModelQuantizeParamsWrapper*)ruby_xmalloc(sizeof(LLaMAModelQuantizeParamsWrapper));
+    new (ptr) LLaMAModelQuantizeParamsWrapper();
+    return TypedData_Wrap_Struct(self, &llama_model_quantize_params_type, ptr);
+  };
+
+  static void llama_model_quantize_params_free(void* ptr) {
+    ((LLaMAModelQuantizeParamsWrapper*)ptr)->~LLaMAModelQuantizeParamsWrapper();
+    ruby_xfree(ptr);
+  };
+
+  static size_t llama_model_quantize_params_size(const void* ptr) {
+    return sizeof(*((LLaMAModelQuantizeParamsWrapper*)ptr));
+  };
+
+  static LLaMAModelQuantizeParamsWrapper* get_llama_model_quantize_params(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr;
+    TypedData_Get_Struct(self, LLaMAModelQuantizeParamsWrapper, &llama_model_quantize_params_type, ptr);
+    return ptr;
+  };
+
+  static void define_class(VALUE outer) {
+    rb_cLLaMAModelQuantizeParams = rb_define_class_under(outer, "ModelQuantizeParams", rb_cObject);
+    rb_define_alloc_func(rb_cLLaMAModelQuantizeParams, llama_model_quantize_params_alloc);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_n_thread), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "n_thread", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_n_thread), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_ftype), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "ftype", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_ftype), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_allow_requantize), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+  };
+
+private:
+  static const rb_data_type_t llama_model_quantize_params_type;
+
+  // n_thread
+  static VALUE _llama_model_quantize_params_set_n_thread(VALUE self, VALUE n_thread) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.nthread = NUM2INT(n_thread);
+    return INT2NUM(ptr->params.nthread);
+  };
+
+  static VALUE _llama_model_quantize_params_get_n_thread(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return INT2NUM(ptr->params.nthread);
+  };
+
+  // ftype
+  static VALUE _llama_model_quantize_params_set_ftype(VALUE self, VALUE ftype) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.ftype = static_cast<enum llama_ftype>(NUM2INT(ftype));
+    return INT2NUM(ptr->params.ftype);
+  };
+
+  static VALUE _llama_model_quantize_params_get_ftype(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return INT2NUM(ptr->params.ftype);
+  };
+
+  // allow_requantize
+  static VALUE _llama_model_quantize_params_set_allow_requantize(VALUE self, VALUE allow_requantize) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    if (NIL_P(allow_requantize) || allow_requantize == Qfalse) {
+      ptr->params.allow_requantize = false;
+    } else {
+      ptr->params.allow_requantize = true;
+    }
+    return ptr->params.allow_requantize ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_model_quantize_params_get_allow_requantize(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.allow_requantize ? Qtrue : Qfalse;
+  };
+
+  // quantize_output_tensor
+  static VALUE _llama_model_quantize_params_set_quantize_output_tensor(VALUE self, VALUE quantize_output_tensor) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    if (NIL_P(quantize_output_tensor) || quantize_output_tensor == Qfalse) {
+      ptr->params.quantize_output_tensor = false;
+    } else {
+      ptr->params.quantize_output_tensor = true;
+    }
+    return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_model_quantize_params_get_quantize_output_tensor(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
+  };
+};
+
+const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
+  "RbLLaMAModelQuantizeParams",
+  { NULL,
+    RbLLaMAModelQuantizeParams::llama_model_quantize_params_free,
+    RbLLaMAModelQuantizeParams::llama_model_quantize_params_size },
+  NULL,
+  NULL,
+  RUBY_TYPED_FREE_IMMEDIATELY
+};
+
 class LLaMAContextWrapper {
 public:
   struct llama_context* ctx;
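
This block is the `LLaMACpp::ModelQuantizeParams` class behind the new `model_quantize` signature noted as a breaking change in the CHANGELOG. A hedged sketch combining the writers defined above with one of the k-quant `LLAMA_FTYPE_*` constants registered near the end of this file; the file names are placeholders and the comments paraphrase the llama.cpp field semantics:

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # target quantization type
params.n_thread = 4                                 # maps to llama_model_quantize_params.nthread
params.allow_requantize = false                     # do not re-quantize already quantized tensors
params.quantize_output_tensor = true                # quantize the output tensor as well

LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
```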
@@ -465,6 +637,7 @@ public:
     rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
     rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
     rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
+    rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
@@ -517,7 +690,7 @@ private:
       return Qnil;
     }
     if (!rb_obj_is_kind_of(kw_values[1], rb_cLLaMAContextParams)) {
-      rb_raise(rb_eArgError, "params must be a …
+      rb_raise(rb_eArgError, "params must be a ContextParams");
       return Qnil;
     }
 
@@ -599,6 +772,24 @@ private:
     return Qnil;
   };
 
+  static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    if (!RB_TYPE_P(fname_, T_STRING)) {
+      rb_raise(rb_eArgError, "fname must be a string");
+      return Qnil;
+    }
+    const char* fname = StringValueCStr(fname_);
+    if (llama_eval_export(ptr->ctx, fname) != 0) {
+      return Qfalse;
+    }
+    RB_GC_GUARD(fname_);
+    return Qtrue;
+  };
+
   static VALUE _llama_context_tokenize(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
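
`eval_export` wraps `llama_eval_export`, which llama.cpp uses to write a static computation graph for a single-token evaluation to a file; the binding raises `ArgumentError` for a non-String argument and returns `true` or `false` depending on the result. A minimal sketch with an arbitrary output file name:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

# Writes the exported compute graph to the given path.
warn 'eval_export failed' unless context.eval_export('llama.ggml')
```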
@@ -1428,10 +1619,10 @@ static VALUE rb_llama_llama_init_backend(VALUE self) {
 
 static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   VALUE kw_args = Qnil;
-  ID kw_table[…
-  VALUE kw_values[…
+  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
+  VALUE kw_values[3] = { Qundef, Qundef, Qundef };
   rb_scan_args(argc, argv, ":", &kw_args);
-  rb_get_kwargs(kw_args, kw_table, 3, …
+  rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
 
   if (!RB_TYPE_P(kw_values[0], T_STRING)) {
     rb_raise(rb_eArgError, "input_path must be a string");
@@ -1441,21 +1632,16 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
     rb_raise(rb_eArgError, "output_path must be a string");
     return Qnil;
   }
-  if (!…
-    rb_raise(rb_eArgError, "…
-    return Qnil;
-  }
-  if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
-    rb_raise(rb_eArgError, "n_threads must be an integer");
+  if (!rb_obj_is_kind_of(kw_values[2], rb_cLLaMAModelQuantizeParams)) {
+    rb_raise(rb_eArgError, "params must be a ModelQuantizeParams");
     return Qnil;
   }
 
   const char* input_path = StringValueCStr(kw_values[0]);
   const char* output_path = StringValueCStr(kw_values[1]);
-  …
-  const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+  LLaMAModelQuantizeParamsWrapper* wrapper = RbLLaMAModelQuantizeParams::get_llama_model_quantize_params(kw_values[2]);
 
-  if (llama_model_quantize(input_path, output_path, (…
+  if (llama_model_quantize(input_path, output_path, &(wrapper->params)) != 0) {
     rb_raise(rb_eRuntimeError, "Failed to quantize model");
     return Qnil;
   }
@@ -1505,6 +1691,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -1513,6 +1701,15 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_L", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_L));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
 
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;