llama_cpp 0.11.1 → 0.12.1
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +3 -3
- data/examples/chat.rb +6 -2
- data/examples/embedding.rb +5 -1
- data/examples/simple.rb +4 -1
- data/ext/llama_cpp/llama_cpp.cpp +63 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +5 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 13381408318e71cc1fc55c40ee9be6e62ad9e3ad6a8ce39279bb8040614e9b3b
+  data.tar.gz: 6456734b18865a7811f08d0d9d599771f574f4b59bd5b54a964ece7428115907
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1014349771d7aa3c318027de11603e96d5482e4bd5b1bcf0fd4874040245daf44c4cfb801077a698846459a7619ca9e01e0afc3507fc7bd519e7ba68a000a15d
+  data.tar.gz: 1315ca8954397edb0db93347a10762e35f829377ef3dba0ea9cf6c67f986972ac8e75b46c410a3ceceefc0474f2abbe6f441e56a60e789ef1d2617fc15cfb29e
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
+## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
+
+- Bump bundled llama.cpp from b1768 to b1833.
+- Add model file type constants.
+- Add `kv_cache_seq_div` method to `Context`.
+
+## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
+
+- Add `get_one` singleton method to `Batch`.
+
+**Breaking Changes**
+
+- Add deprecation warning to `eval`, `eval_embd`, and `sample_temperature` methods on `Context`.
+- Change to avoid using deprecated methods on `generate` method and example scripts.
+
 ## [[0.11.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.0...v0.11.1)] - 2024-01-08
 
 - Fix to set the values of `@n_tokens` and `@has_evaluated` instance variables in `decode` method of `Context`.
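As a quick orientation for the 0.12.x entries above: prompt evaluation now goes through `Batch.get_one` plus `Context#decode` instead of the deprecated `Context#eval`, and `sample_temp` replaces `sample_temperature`. A minimal sketch of the new pattern, assuming a valid GGUF model; the model path and prompt below are placeholders, not part of this release:

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: './model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# Tokenize the prompt and feed it as a single-sequence batch (0.12.0+ style).
tokens = context.model.tokenize(text: 'Hello my name is', add_bos: true)
batch = LLaMACpp::Batch.get_one(tokens: tokens, n_tokens: tokens.size, pos_zero: 0, seq_id: 0)
context.decode(batch)
```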
data/README.md
CHANGED
@@ -22,14 +22,14 @@ If bundler is not being used to manage dependencies, install the gem by executin
 $ gem install llama_cpp
 ```
 
-There are several installation options
+There are several installation options:
 
 ```sh
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use
-$ gem install llama_cpp -- --with-
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/examples/chat.rb
CHANGED
@@ -9,6 +9,7 @@
 require 'llama_cpp'
 require 'thor'
 require 'readline'
+require 'etc'
 
 class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 default_command :main
@@ -30,12 +31,15 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
 option :temp, type: :numeric, desc: 'temperature', default: 0.8
 option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
 def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
 mdl_params = LLaMACpp::ModelParams.new
 mdl_params.n_gpu_layers = options[:n_gpu_layers]
 model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
 ctx_params = LLaMACpp::ContextParams.new
 ctx_params.seed = options[:seed] if options[:seed] != -1
+ctx_params.n_threads = options[:n_threads]
+ctx_params.n_threads_batch = options[:n_threads]
 context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
 antiprompt = options[:reverse_prompt] || 'User:'
@@ -70,7 +74,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
 0.step(embd.size - 1, options[:batch_size]) do |i|
 n_eval = [options[:batch_size], embd.size - i].min
-context.
+context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
 n_past += n_eval
 end
 end
@@ -95,7 +99,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 context.sample_tail_free(candidates, z: options[:tfs_z])
 context.sample_typical(candidates, prob: options[:typical_p])
 context.sample_top_p(candidates, prob: options[:top_p])
-context.
+context.sample_temp(candidates, temp: options[:temp])
 id = context.sample_token(candidates)
 
 last_n_tokens.shift
data/examples/embedding.rb
CHANGED
@@ -7,6 +7,7 @@
 
 require 'llama_cpp'
 require 'thor'
+require 'etc'
 
 class Embedding < Thor # rubocop:disable Style/Documentation
 default_command :main
@@ -15,6 +16,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
 option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
 option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
 option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
 def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
 mdl_params = LLaMACpp::ModelParams.new
 mdl_params.n_gpu_layers = options[:n_gpu_layers]
@@ -22,13 +24,15 @@ class Embedding < Thor # rubocop:disable Style/Documentation
 ctx_params = LLaMACpp::ContextParams.new
 ctx_params.embedding = true
 ctx_params.seed = options[:seed] if options[:seed] != -1
+ctx_params.n_threads = options[:n_threads]
+ctx_params.n_threads_batch = options[:n_threads]
 context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
 embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
 
 return unless embd_input.size.positive?
 
-context.
+context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
 
 context.embeddings.each { |val| print("#{val} ") }
 print("\n")
data/examples/simple.rb
CHANGED
@@ -7,12 +7,14 @@
 
 require 'llama_cpp'
 require 'thor'
+require 'etc'
 
 class Simple < Thor # rubocop:disable Style/Documentation
 default_command :main
 desc 'main', 'Simple completion'
 option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
 option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
+option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
 def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
 n_len = 32
 model_params = LLaMACpp::ModelParams.new
@@ -21,7 +23,8 @@ class Simple < Thor # rubocop:disable Style/Documentation
 context_params.seed = 1234
 context_params.n_ctx = 2048
 context_params.logits_all = true
-context_params.n_threads =
+context_params.n_threads = options[:n_threads]
+context_params.n_threads_batch = options[:n_threads]
 context = LLaMACpp::Context.new(model: model, params: context_params)
 
 tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)
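All three example scripts now share the same thread-count setup: an `n_threads` Thor option defaulting to `Etc.nprocessors`, applied to both `n_threads` and `n_threads_batch` on the context parameters. Condensed into a standalone sketch (the values are illustrative, not prescriptive):

```ruby
require 'etc'
require 'llama_cpp'

n_threads = Etc.nprocessors # default used by the updated examples

ctx_params = LLaMACpp::ContextParams.new
ctx_params.n_threads = n_threads       # threads used during generation
ctx_params.n_threads_batch = n_threads # threads used for batch/prompt processing
```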
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -51,6 +51,7 @@ public:
 static void define_class(VALUE outer) {
 rb_cLLaMABatch = rb_define_class_under(outer, "Batch", rb_cObject);
 rb_define_alloc_func(rb_cLLaMABatch, llama_batch_alloc);
+rb_define_singleton_method(rb_cLLaMABatch, "get_one", RUBY_METHOD_FUNC(_llama_batch_get_one), -1);
 rb_define_method(rb_cLLaMABatch, "initialize", RUBY_METHOD_FUNC(_llama_batch_initialize), -1);
 rb_define_method(rb_cLLaMABatch, "n_tokens=", RUBY_METHOD_FUNC(_llama_batch_set_n_tokens), 1);
 rb_define_method(rb_cLLaMABatch, "n_tokens", RUBY_METHOD_FUNC(_llama_batch_get_n_tokens), 0);
@@ -75,6 +76,48 @@ public:
 private:
 static const rb_data_type_t llama_batch_type;
 
+static VALUE _llama_batch_get_one(int argc, VALUE* argv, VALUE klass) {
+VALUE kw_args = Qnil;
+ID kw_table[4] = { rb_intern("tokens"), rb_intern("n_tokens"), rb_intern("pos_zero"), rb_intern("seq_id") };
+VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+rb_scan_args(argc, argv, ":", &kw_args);
+rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+rb_raise(rb_eArgError, "tokens must be an array");
+return Qnil;
+}
+if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+rb_raise(rb_eArgError, "n_tokens must be an integer");
+return Qnil;
+}
+if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+rb_raise(rb_eArgError, "pos_zero must be an integer");
+return Qnil;
+}
+if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+rb_raise(rb_eArgError, "seq_id must be an integer");
+return Qnil;
+}
+
+const size_t sz_array = RARRAY_LEN(kw_values[0]);
+const int32_t n_tokens = NUM2INT(kw_values[1]);
+const llama_pos pos_zero = NUM2INT(kw_values[2]);
+const llama_seq_id seq_id = NUM2INT(kw_values[3]);
+
+LLaMABatchWrapper* ptr = (LLaMABatchWrapper*)ruby_xmalloc(sizeof(LLaMABatchWrapper));
+new (ptr) LLaMABatchWrapper();
+ptr->batch = llama_batch_get_one(nullptr, n_tokens, pos_zero, seq_id);
+
+ptr->batch.token = (llama_token*)malloc(sizeof(llama_token) * sz_array);
+for (size_t i = 0; i < sz_array; i++) {
+VALUE el = rb_ary_entry(kw_values[0], i);
+ptr->batch.token[i] = NUM2INT(el);
+}
+
+return TypedData_Wrap_Struct(klass, &llama_batch_type, ptr);
+}
+
 static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
 VALUE kw_args = Qnil;
 ID kw_table[3] = { rb_intern("max_n_token"), rb_intern("n_embd"), rb_intern("max_n_seq") };
@@ -1983,6 +2026,7 @@ public:
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
 rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
 rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
 rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
@@ -2054,6 +2098,8 @@ private:
 rb_scan_args(argc, argv, ":", &kw_args);
 rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+rb_warn("eval is deprecated. Use decode instead.");
+
 if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
 rb_raise(rb_eArgError, "tokens must be an Array");
 return Qnil;
@@ -2104,6 +2150,8 @@ private:
 rb_scan_args(argc, argv, ":", &kw_args);
 rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+rb_warn("eval_embd is deprecated. Use decode instead.");
+
 if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
 rb_raise(rb_eArgError, "tokens must be an Array");
 return Qnil;
@@ -2331,6 +2379,16 @@ private:
 return Qnil;
 }
 
+static VALUE _llama_context_kv_cache_seq_div(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE d) {
+LLaMAContextWrapper* ptr = get_llama_context(self);
+if (ptr->ctx == NULL) {
+rb_raise(rb_eArgError, "LLaMA context is not initialized");
+return Qnil;
+}
+llama_kv_cache_seq_div(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(d));
+return Qnil;
+}
+
 static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
 LLaMAContextWrapper* ptr = get_llama_context(self);
 if (ptr->ctx == NULL) {
@@ -2794,6 +2852,8 @@ private:
 rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
 rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
 
+rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
+
 if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
 rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
 return Qnil;
@@ -3160,6 +3220,9 @@ extern "C" void Init_llama_cpp(void) {
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
+rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
+rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
+rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
 
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
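The new `kv_cache_seq_div` binding above forwards to `llama_kv_cache_seq_div`, which in upstream llama.cpp integer-divides the cached positions of sequence `seq_id` in the range `[p0, p1)` by `d` (the building block of the "self-extend" context-extension trick). A hedged usage sketch, assuming `context` is an initialized `LLaMACpp::Context` and the numbers below are placeholders:

```ruby
# Positional arguments mirror the C binding: (seq_id, p0, p1, d).
seq_id = 0
n_past = 128 # placeholder: tokens already decoded into the KV cache
context.kv_cache_seq_div(seq_id, 0, n_past, 2) # compress cached positions in [0, n_past) by 2
```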
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
-VERSION = '0.11.1'
+VERSION = '0.12.1'
 
 # The version of llama.cpp bundled with llama_cpp.rb.
-LLAMA_CPP_VERSION = 'b1768'
+LLAMA_CPP_VERSION = 'b1833'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -54,7 +54,7 @@ module LLaMACpp
 embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
 end
 
-context.
+context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
 end
 
 n_past += embd.size
@@ -77,7 +77,7 @@ module LLaMACpp
 context.sample_tail_free(candidates, z: tfs_z)
 context.sample_typical(candidates, prob: typical_p)
 context.sample_top_p(candidates, prob: top_p)
-context.
+context.sample_temp(candidates, temp: temperature)
 id = context.sample_token(candidates)
 
 last_n_tokens.shift
data/sig/llama_cpp.rbs
CHANGED
@@ -22,6 +22,9 @@ module LLaMACpp
 LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
 LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
 LLAMA_FTYPE_MOSTLY_Q6_K: Integer
+LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
+LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
+LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
 
 LLAMA_KV_OVERRIDE_INT: Integer
 LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -149,6 +152,7 @@ module LLaMACpp
 class Batch
 public
 
+def self.get_one: (tokens: Array[Integer], n_tokens: Integer, pos_zero: Integer, seq_id: Integer) -> ::LLaMACpp::Batch
 def initialize: (max_n_token: Integer, n_embd: Integer, max_n_seq: Integer) -> void
 def n_tokens=: (Integer) -> Integer
 def n_tokens: () -> Integer
@@ -192,6 +196,7 @@ module LLaMACpp
 def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
 def kv_cache_seq_keep: (Integer) -> void
 def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
 def set_rng_seed: (Integer) -> void
 def load_session_file: (session_path: String) -> void
 def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
+speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -620,6 +620,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
@@ -671,6 +674,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -90,7 +90,7 @@ extern "C" {
 void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
 // compute graph without a plan
-
+bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
 // check if the backend supports an operation
 bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
 ggml_backend_synchronize(backend);
 }
 
-
-backend->iface.graph_compute(backend, cgraph)
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+if (!backend->iface.graph_compute(backend, cgraph)) {
+return false;
+}
 
 // TODO: optional sync
 ggml_backend_synchronize(backend);
+return true;
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
 GGML_UNUSED(backend);
 }
 
-static
+static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
 struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
 cplan.work_data = cpu_ctx->work_data;
 
 ggml_graph_compute(cgraph, &cplan);
+return true;
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -58,7 +58,7 @@ extern "C" {
 
 GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-GGML_API
+GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
 
 // tensor copy between different backends