llama_cpp 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 8a6623a24970936369944231171226dda1ce579bf29fc3711f8923c8d2d22cba
- data.tar.gz: dbff8f38ea54195b05fc0acbaf8fceb7fd6bfdc329100a18665ef2cba2fd5d81
+ metadata.gz: af3a0e01bc9f3cfad4cee3f21144dd354640e1d4558125be36d4b499fa3b4c24
+ data.tar.gz: 042a3b0491d98fa6a093c684e6ab751152f37c8438a3b4a7b19cb2d8c7ab95a7
  SHA512:
- metadata.gz: 710ab86cfea7b5f91a386bdf87872c1d19ba49057bc02aa11a4f0198aee404a2d5b931965fdeba40aa1353269f95a451090e261305931e31a182a078827ace80
- data.tar.gz: ec4d956b5ab5ad665a0e99489b81b364b79ed39e74146629e4140240b5e176f4ef9dbf3d1c11acdb4098398114fbf055a2ad4f8251ed98ec42471a478f6dcaa2
+ metadata.gz: 7ed85bd8438ee3b3adab884795c4aecb5b0d72ad57b7e02bc281b62c3b1d669efab62a020e03b09defe3084ecd8afacc4220303e99167d04d668650768c7392b
+ data.tar.gz: b705a0ccd2c7c1e15aed6383acb9d5a3d79d0a0c882a74c42b9099df9a27aff88ba08a2f06aa4d195382e8f41c1b16c0014a2047d1923369f275ca481d52bb21
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
+ ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from master-097e121 to master-b1060.
+ - Support new file format GGUF.
+ - You should re-convert / re-quantize your model files.
+ - Remove vocab methods.
+ - Move token_bos, token_eos, and token_nl methods to Context.
+ - Add text, score, and type methods to Context.
+
  ## [[0.3.8](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.7...v0.3.8)] - 2023-08-19
 
  - Bump bundled llama.cpp from master-9ca4abe to master-097e121.
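The breaking changes above move the BOS/EOS/newline token helpers from module-level functions onto `Context` and add per-token `text`, `score`, and `type` accessors. A minimal migration sketch (the model path is a placeholder, and the `Model`/`Context` constructor keywords follow the gem's README of this era, so treat them as assumptions rather than part of this diff):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: './models/open_llama_7b/ggml-model-q4_0.bin', params: params)
context = LLaMACpp::Context.new(model: model)

# 0.3.x (removed): LLaMACpp.token_bos / LLaMACpp.token_eos / LLaMACpp.token_nl
# 0.4.0: the helpers live on the context
p [context.token_bos, context.token_eos, context.token_nl]

# per-token accessors added in 0.4.0
eos = context.token_eos
puts context.text(eos)   # text piece stored for the token
puts context.score(eos)  # its vocabulary score
puts context.type(eos)   # one of the LLAMA_TOKEN_TYPE_* constants
```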
data/README.md CHANGED
@@ -51,7 +51,7 @@ $ git clone https://huggingface.co/openlm-research/open_llama_7b
  $ cd ../
  $ python3 convert.py models/open_llama_7b
  $ make
- $ ./quantize ./models/open_llama_7b/ggml-model-f16.bin ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
+ $ ./quantize ./models/open_llama_7b/ggml-model-f16.gguf ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
  ```
 
  An example of Ruby code that generates sentences with the quantization model is as follows:
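The hunk stops just where the README's generation example begins. As a sketch of what that call looks like against the quantized GGUF model, reusing the `context` built in the sketch after the changelog above (the prompt and the `n_threads` keyword are illustrative assumptions, not part of this diff):

```ruby
# generate a completion from the quantized model (prompt and n_threads are placeholders)
puts LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)
```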
data/examples/chat.rb CHANGED
@@ -49,8 +49,6 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  n_keep = options[:keep]
  n_keep = embd_input.size if n_keep > embd_input.size
 
- token_newline = context.tokenize(text: "\n", add_bos: false)
-
  last_n_tokens = [0] * n_ctx
  interactive = true
  is_interacting = false
@@ -101,8 +99,8 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  last_n_tokens.shift
  last_n_tokens.push(id)
 
- if id == LLaMACpp.token_eos
- id = token_newline.first
+ if id == context.token_eos
+ id = context.token_nl
  unless antiprompt.empty?
  first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
  embd_input.concat(first_antiprompt)
@@ -53,7 +53,7 @@ if with_config('metal')
  $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
  $CXXFLAGS << ' -DGGML_USE_METAL'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
- $objs = %w[ggml.o ggml-alloc.o llama.o llama_cpp.o ggml-metal.o]
+ $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end
 
@@ -61,7 +61,7 @@ if with_config('cublas')
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
- $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
+ $objs = %w[ggml.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end
 
@@ -808,10 +808,9 @@ public:
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
  rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
  rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
- rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_n_vocab_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_n_ctx_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_n_embd_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
+ rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
+ rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+ rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
  rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
  rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
@@ -960,53 +959,19 @@ private:
  return Qnil;
  }
 
- static VALUE _llama_model_get_n_vocab_from_model(VALUE self) {
+ static VALUE _llama_model_get_model_n_vocab(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- return INT2NUM(llama_n_vocab_from_model(ptr->model));
+ return INT2NUM(llama_model_n_vocab(ptr->model));
  }
 
- static VALUE _llama_model_get_n_ctx_from_model(VALUE self) {
+ static VALUE _llama_model_get_model_n_ctx(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- return INT2NUM(llama_n_ctx_from_model(ptr->model));
+ return INT2NUM(llama_model_n_ctx(ptr->model));
  }
 
- static VALUE _llama_model_get_n_embd_from_model(VALUE self) {
+ static VALUE _llama_model_get_model_n_embd(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- return INT2NUM(llama_n_embd_from_model(ptr->model));
- }
-
- static VALUE _llama_model_get_vocab_from_model(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("capacity") };
- VALUE kw_values[1] = { Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- if (!RB_INTEGER_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "capacity must be an integer");
- return Qnil;
- }
-
- const int capacity = NUM2INT(kw_values[0]);
-
- LLaMAModelWrapper* ptr = get_llama_model(self);
- const int n = std::min(capacity, llama_n_vocab_from_model(ptr->model));
- const char** vocabs = ALLOCA_N(const char*, n);
- float* scores = ALLOCA_N(float, n);
-
- llama_get_vocab_from_model(ptr->model, vocabs, scores, capacity);
-
- VALUE vocabs_ary = rb_ary_new();
- VALUE scores_ary = rb_ary_new();
-
- for (int i = 0; i < n; i++) {
- rb_ary_push(vocabs_ary, rb_str_new_cstr(vocabs[i]));
- rb_ary_push(scores_ary, DBL2NUM(scores[i]));
- }
-
- VALUE ret = rb_ary_new3(2, vocabs_ary, scores_ary);
-
- return ret;
+ return INT2NUM(llama_model_n_embd(ptr->model));
  }
 
  static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
@@ -1016,8 +981,20 @@ private:
  }
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
- const char* str = llama_token_to_str_with_model(ptr->model, token);
- return rb_str_new_cstr(str);
+ std::vector<char> result(8, 0);
+ const int n_tokens = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ const int check = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ if (check != -n_tokens) {
+ rb_raise(rb_eRuntimeError, "failed to convert");
+ return Qnil;
+ }
+ } else {
+ result.resize(n_tokens);
+ }
+ std::string ret(result.data(), result.size());
+ return rb_str_new_cstr(ret.c_str());
  }
 
  static VALUE _llama_model_tokenize_with_model(int argc, VALUE* argv, VALUE self) {
@@ -1343,7 +1320,12 @@ public:
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
- rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
+ rb_define_method(rb_cLLaMAContext, "text", RUBY_METHOD_FUNC(_llama_context_text), 1);
+ rb_define_method(rb_cLLaMAContext, "score", RUBY_METHOD_FUNC(_llama_context_score), 1);
+ rb_define_method(rb_cLLaMAContext, "type", RUBY_METHOD_FUNC(_llama_context_type), 1);
+ rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
+ rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
+ rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
  rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -1592,8 +1574,20 @@ private:
  return Qnil;
  }
  const llama_token token = NUM2INT(token_);
- const char* str = llama_token_to_str(ptr->ctx, token);
- return str != nullptr ? rb_utf8_str_new_cstr(str) : rb_utf8_str_new_cstr("");
+ std::vector<char> result(8, 0);
+ const int n_tokens = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ const int check = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ if (check != -n_tokens) {
+ rb_raise(rb_eRuntimeError, "failed to convert");
+ return Qnil;
+ }
+ } else {
+ result.resize(n_tokens);
+ }
+ std::string ret(result.data(), result.size());
+ return rb_str_new_cstr(ret.c_str());
  }
 
  static VALUE _llama_context_logits(VALUE self) {
@@ -1649,41 +1643,64 @@ private:
  return output;
  }
 
- static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("capacity") };
- VALUE kw_values[1] = { Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- if (!RB_INTEGER_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "capacity must be an integer");
+ static VALUE _llama_context_text(VALUE self, VALUE token_) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
+ const llama_token token = NUM2INT(token_);
+ const char* text = llama_token_get_text(ptr->ctx, token);
+ return rb_str_new_cstr(text);
+ }
 
+ static VALUE _llama_context_score(VALUE self, VALUE token_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
+ const llama_token token = NUM2INT(token_);
+ const float score = llama_token_get_score(ptr->ctx, token);
+ return DBL2NUM(score);
+ }
 
- const int capacity = NUM2INT(kw_values[0]);
- std::vector<const char*> strings;
- std::vector<float> scores;
- int n_vocab = llama_n_vocab(ptr->ctx);
- strings.resize(n_vocab, NULL);
- scores.resize(n_vocab, 0);
+ static VALUE _llama_context_type(VALUE self, VALUE token_) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ const llama_token token = NUM2INT(token_);
+ const int type = llama_token_get_type(ptr->ctx, token);
+ return INT2NUM(type);
+ }
 
- n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
+ static VALUE _llama_context_token_bos(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_token_bos(ptr->ctx));
+ }
 
- VALUE ret_strings = rb_ary_new();
- VALUE ret_scores = rb_ary_new();
- for (int i = 0; i < n_vocab; i++) {
- rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
- rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
+ static VALUE _llama_context_token_eos(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
  }
+ return INT2NUM(llama_token_eos(ptr->ctx));
+ }
 
- return rb_ary_new_from_args(2, ret_strings, ret_scores);
+ static VALUE _llama_context_token_nl(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_token_nl(ptr->ctx));
  }
 
  static VALUE _llama_context_n_vocab(VALUE self) {
@@ -2474,23 +2491,15 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  return Qnil;
  }
 
- static VALUE rb_llama_token_bos(VALUE self) {
- return INT2NUM(llama_token_bos());
- }
-
- static VALUE rb_llama_token_eos(VALUE self) {
- return INT2NUM(llama_token_eos());
- }
-
- static VALUE rb_llama_token_nl(VALUE self) {
- return INT2NUM(llama_token_nl());
- }
-
  static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
  }
 
+ static VALUE rb_llama_time_us(VALUE self) {
+ return LONG2NUM(llama_time_us());
+ }
+
  static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
  }
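The hunk above removes the module-level `token_bos`/`token_eos`/`token_nl` wrappers (they now live on `Context`) and adds `rb_llama_time_us`, exposed to Ruby as `LLaMACpp.time_us`. A small timing sketch using it, reusing the `context` from the earlier sketch (`LLaMACpp.generate` and its keywords are assumptions, not part of this diff):

```ruby
t0 = LLaMACpp.time_us
output = LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)
t1 = LLaMACpp.time_us

puts output
puts format('generated in %.1f ms', (t1 - t0) / 1000.0)
```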
@@ -2519,16 +2528,29 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
  rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
- rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
- rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
- rb_define_module_function(rb_mLLaMACpp, "token_nl", rb_llama_token_nl, 0);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+ rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
 
  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
 
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_ERROR", INT2NUM(LLAMA_LOG_LEVEL_ERROR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_WARN", INT2NUM(LLAMA_LOG_LEVEL_WARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_INFO", INT2NUM(LLAMA_LOG_LEVEL_INFO));
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_CONTROL", INT2NUM(LLAMA_TOKEN_TYPE_CONTROL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_USER_DEFINED", INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
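This hunk registers the new log-level, vocabulary-type, and token-type constants introduced with the GGUF-era llama.cpp API. A hedged sketch of pairing them with the new `Context#type` and `Context#text` accessors (the lookup table, prompt, and `context` variable are illustrative; the `tokenize` keywords match the chat.rb example earlier in this diff):

```ruby
TOKEN_TYPE_NAMES = {
  LLaMACpp::LLAMA_TOKEN_TYPE_UNDEFINED    => 'undefined',
  LLaMACpp::LLAMA_TOKEN_TYPE_NORMAL       => 'normal',
  LLaMACpp::LLAMA_TOKEN_TYPE_UNKNOWN      => 'unknown',
  LLaMACpp::LLAMA_TOKEN_TYPE_CONTROL      => 'control',
  LLaMACpp::LLAMA_TOKEN_TYPE_USER_DEFINED => 'user-defined',
  LLaMACpp::LLAMA_TOKEN_TYPE_UNUSED       => 'unused',
  LLaMACpp::LLAMA_TOKEN_TYPE_BYTE         => 'byte'
}.freeze

# print each token's id, type name, and text piece for a short prompt
context.tokenize(text: 'Hello, World.', add_bos: true).each do |t|
  puts format('%6d  %-12s  %s', t, TOKEN_TYPE_NAMES[context.type(t)], context.text(t))
end
```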
@@ -2547,6 +2569,8 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
 
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_RULE_REF", INT2NUM(LLAMA_GRETYPE_RULE_REF));
@@ -2556,39 +2580,9 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));
 
  std::stringstream ss_magic;
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGJT", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGMF;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGMF", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGML;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGML", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_UNVERSIONED;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_UNVERSIONED", rb_str_new2(ss_magic.str().c_str()));
-
  ss_magic.str("");
  ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
@@ -2599,6 +2593,5 @@ extern "C" void Init_llama_cpp(void) {
  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
  }
@@ -8,6 +8,7 @@
 
  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
  //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,8 +68,8 @@ struct ggml_allocr {
  struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
  size_t max_size;
  bool measure;
- int parse_seq[GGML_MAX_NODES];
- bool has_parse_seq;
+ int parse_seq[GGML_MAX_CONCUR];
+ int parse_seq_len;
 
  #ifdef GGML_ALLOCATOR_DEBUG
  struct ggml_tensor * allocated_tensors[1024];
@@ -76,7 +77,7 @@ struct ggml_allocr {
  };
 
  #ifdef GGML_ALLOCATOR_DEBUG
- static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i] == NULL) {
  alloc->allocated_tensors[i] = tensor;
@@ -85,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
  }
  GGML_ASSERT(!"out of allocated_tensors");
  }
- static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i] == tensor ||
  (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -238,15 +239,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
  alloc->n_free_blocks++;
  }
 
- void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
- int pos = 0;
+ void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
  for (int i = 0; i < n; i++) {
- if (list[i] != -1) {
- alloc->parse_seq[pos] = list[i];
- pos++;
- }
+ alloc->parse_seq[i] = list[i];
  }
- alloc->has_parse_seq = true;
+ alloc->parse_seq_len = n;
  }
 
  void ggml_allocr_reset(struct ggml_allocr * alloc) {
@@ -269,7 +266,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  /*.max_size = */ 0,
  /*.measure = */ false,
  /*.parse_seq = */ {0},
- /*.has_parse_seq = */ false,
+ /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
  /*.allocated_tensors = */ = {0},
  #endif
@@ -298,7 +295,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  /*.max_size = */ 0,
  /*.measure = */ true,
  /*.parse_seq = */ {0},
- /*.has_parse_seq = */ false,
+ /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
  /*.allocated_tensors = */ = {0},
  #endif
@@ -445,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  else {
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
  node->data = parent->data;
+ return;
  }
- return;
  }
  }
  }
@@ -497,69 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  allocate_node(alloc, input);
  }
  }
- for (int ind = 0; ind < gf->n_nodes; ind++) {
- int i;
- if (alloc->has_parse_seq) {
- i = alloc->parse_seq[ind];
- } else {
- i = ind;
- }
- struct ggml_tensor * node = gf->nodes[i];
-
- // allocate parents (leafs)
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
+ // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+ int last_barrier_pos = 0;
+ int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+ for (int ind = 0; ind < n_nodes; ind++) {
+ // allocate a node if there is no parse_seq or this is not a barrier
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+ int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+ struct ggml_tensor * node = gf->nodes[i];
+
+ // allocate parents (leafs)
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ allocate_node(alloc, parent);
  }
- allocate_node(alloc, parent);
- }
 
- // allocate node
- allocate_node(alloc, node);
+ // allocate node
+ allocate_node(alloc, node);
 
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- AT_PRINTF("%s", parent->name);
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
- AT_PRINTF(", ");
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ AT_PRINTF("%s", parent->name);
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+ AT_PRINTF(", ");
+ }
  }
+ AT_PRINTF("\n");
  }
- AT_PRINTF("\n");
+
 
  // update parents
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- struct hash_node * p_hn = hash_get(ht, parent);
- p_hn->n_children -= 1;
-
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = get_view_source(parent);
- struct hash_node * view_src_hn = hash_get(ht, view_src);
- view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
- ggml_allocator_free_tensor(alloc, view_src);
+ // update immediately if there is no parse_seq
+ // update only at barriers if there is parse_seq
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+ int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+ int update_end = alloc->parse_seq_len ? ind : ind + 1;
+ for (int i = update_start; i < update_end; i++) {
+ int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+ struct ggml_tensor * node = gf->nodes[node_i];
+
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
  }
- }
- else {
- if (parent->data != node->data) {
- ggml_allocator_free_tensor(alloc, parent);
+ struct hash_node * p_hn = hash_get(ht, parent);
+ p_hn->n_children -= 1;
+
+ //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+ if (ggml_is_view(parent)) {
+ struct ggml_tensor * view_src = get_view_source(parent);
+ struct hash_node * view_src_hn = hash_get(ht, view_src);
+ view_src_hn->n_views -= 1;
+ AT_PRINTF("view_src %s\n", view_src->name);
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+ ggml_allocator_free_tensor(alloc, view_src);
+ }
+ }
+ else {
+ if (parent->data != node->data) {
+ ggml_allocator_free_tensor(alloc, parent);
+ }
+ }
  }
  }
  }
+ AT_PRINTF("\n");
+ if (alloc->parse_seq_len) {
+ last_barrier_pos = ind + 1;
+ }
  }
- AT_PRINTF("\n");
  }
  // free graph outputs here that wouldn't be freed otherwise because they have no children
  if (outputs != NULL && outputs[g] != NULL) {
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
  // tell the allocator to parse nodes following the order described in the list
  // you should call this if your graph are optimized to execute out-of-order
- GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+ GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
  GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
  GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);