llama_cpp 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +59 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -4
- data/vendor/tmp/llama.cpp/Makefile +2 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +4 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -21
- data/vendor/tmp/llama.cpp/ggml-backend.h +16 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +63 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +120 -75
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +178 -133
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3432 -1118
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1327 -773
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +227 -15
- data/vendor/tmp/llama.cpp/ggml.h +30 -4
- data/vendor/tmp/llama.cpp/llama.cpp +631 -211
- data/vendor/tmp/llama.cpp/llama.h +28 -10
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: c7d855ccd32ae097f26a671751d6a2178361cf8d8a6c1b99af37859f2c47ca03
+  data.tar.gz: 3b17318424d08c65ad34da3fa14956c86db0a2ea05ac174323a9b8d2b9e69d59
 SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: 2d90bf9fdd8dbaf5e67b7fb8797a9412168ae6ce5fcfc4c6aca34e194d5beb5204184b5bb36d65dc507a7a618ac9e938987e8d8bf5871e4eb6304b5e6de06020
+  data.tar.gz: eab524367ace146eb6e20786bd530cead145e1651bcdb726afbb5364609d04b22ca8a515016bb0c2d154ea97fb62f19222c122bc9bb5efe7fc389a6f259da6f0
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,18 @@
+## [[0.14.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.13.0...v0.14.0)] - 2024-03-09
+
+**Breaking Changes**
+
+- Bump bundled llama.cpp from b2303 to b2361.
+- Rename embedding accessor to `embeddings` in `ContextParams`.
+- Remove `do_pooling` accessor from `ContextParams`.
+- Add `pooling_type` accessor to `ContextParams`.
+- Fix the size of array returned by `embedding` method in `Context` from `n_embd` to `n_tokens * n_embd`.
+- Add `embeddings_seq` method to `Context`.
+
 ## [[0.13.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.7...v0.13.0)] - 2024-03-02
 
+**Breaking Changes**
+
 - Bump bundled llama.cpp from b2143 to b2303.
 - Remove deprecated methods:
   - `map_supported?`, `mlock_supported?`, `apply_lora_from_file`, `eval`, `eval_embd`, `sample_classifier_free_guidance`, `sample_temperature`, and `mul_mat_q`.
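Taken together, the 0.14.0 entries change how embedding extraction is configured. The following is a minimal migration sketch, not taken from the gem's documentation: the model path is a placeholder, and the `Model`/`ModelParams` constructor keywords are assumed to match the gem's README.

    require 'llama_cpp'

    # Placeholder path; any embedding-capable GGUF model would do.
    model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf',
                                params: LLaMACpp::ModelParams.new)

    params = LLaMACpp::ContextParams.new
    params.embeddings = true  # 0.13.x: params.embedding = true
    # 0.13.x used a boolean switch (params.do_pooling = true);
    # 0.14.0 asks for an explicit strategy instead:
    params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_MEAN

    context = LLaMACpp::Context.new(model: model, params: params)

Swapping the `do_pooling` flag for `pooling_type` mirrors the upstream change in llama.cpp b2361: pooling is no longer simply on or off but one of `NONE`, `MEAN`, or `CLS`, plus the new `UNSPECIFIED` constant registered later in this diff.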
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -952,6 +952,8 @@ public:
 rb_define_method(rb_cLLaMAContextParams, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads_batch), 0);
 rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_scaling_type), 1);
 rb_define_method(rb_cLLaMAContextParams, "rope_scaling_type", RUBY_METHOD_FUNC(_llama_context_params_get_rope_scaling_type), 0);
+rb_define_method(rb_cLLaMAContextParams, "pooling_type=", RUBY_METHOD_FUNC(_llama_context_params_set_pooling_type), 1);
+rb_define_method(rb_cLLaMAContextParams, "pooling_type", RUBY_METHOD_FUNC(_llama_context_params_get_pooling_type), 0);
 rb_define_method(rb_cLLaMAContextParams, "rope_freq_base=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_base), 1);
 rb_define_method(rb_cLLaMAContextParams, "rope_freq_base", RUBY_METHOD_FUNC(_llama_context_params_get_rope_freq_base), 0);
 rb_define_method(rb_cLLaMAContextParams, "rope_freq_scale=", RUBY_METHOD_FUNC(_llama_context_params_set_rope_freq_scale), 1);

@@ -974,12 +976,10 @@ public:
 rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
 rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
 rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
-rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
-rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
+rb_define_method(rb_cLLaMAContextParams, "embeddings=", RUBY_METHOD_FUNC(_llama_context_params_set_embeddings), 1);
+rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
 rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
 rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
-rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
-rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
 }

@@ -1058,7 +1058,7 @@ private:
 // rope_scaling_type
 static VALUE _llama_context_params_set_rope_scaling_type(VALUE self, VALUE scaling_type) {
 LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-ptr->params.rope_scaling_type = NUM2INT(scaling_type);
+ptr->params.rope_scaling_type = static_cast<enum llama_rope_scaling_type>(NUM2INT(scaling_type));
 return INT2NUM(ptr->params.rope_scaling_type);
 }

@@ -1067,6 +1067,18 @@ private:
 return INT2NUM(ptr->params.rope_scaling_type);
 }

+// pooling_type
+static VALUE _llama_context_params_set_pooling_type(VALUE self, VALUE scaling_type) {
+LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ptr->params.pooling_type = static_cast<enum llama_pooling_type>(NUM2INT(scaling_type));
+return INT2NUM(ptr->params.pooling_type);
+}
+
+static VALUE _llama_context_params_get_pooling_type(VALUE self) {
+LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+return INT2NUM(ptr->params.pooling_type);
+}
+
 // rope_freq_base
 static VALUE _llama_context_params_set_rope_freq_base(VALUE self, VALUE rope_freq_base) {
 LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);

@@ -1199,16 +1211,16 @@ private:
 return ptr->params.logits_all ? Qtrue : Qfalse;
 }

-// embedding
-static VALUE _llama_context_params_set_embedding(VALUE self, VALUE embedding) {
+// embeddings
+static VALUE _llama_context_params_set_embeddings(VALUE self, VALUE embeddings) {
 LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-ptr->params.embedding = RTEST(embedding) ? true : false;
-return ptr->params.embedding ? Qtrue : Qfalse;
+ptr->params.embeddings = RTEST(embeddings) ? true : false;
+return ptr->params.embeddings ? Qtrue : Qfalse;
 }

-static VALUE _llama_context_params_get_embedding(VALUE self) {
+static VALUE _llama_context_params_get_embeddings(VALUE self) {
 LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-return ptr->params.embedding ? Qtrue : Qfalse;
+return ptr->params.embeddings ? Qtrue : Qfalse;
 }

 // offload_kqv

@@ -1222,18 +1234,6 @@ private:
 LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
 return ptr->params.offload_kqv ? Qtrue : Qfalse;
 }
-
-// do_pooling
-static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
-LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
-return ptr->params.do_pooling ? Qtrue : Qfalse;
-}
-
-static VALUE _llama_context_params_get_do_pooling(VALUE self) {
-LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-return ptr->params.do_pooling ? Qtrue : Qfalse;
-}
 };

 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {

@@ -2016,6 +2016,7 @@ public:
 rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
 rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
 rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
+rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
 rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
 rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
 rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);

@@ -2151,7 +2152,7 @@ private:
 LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
 VALUE params = rb_iv_get(self, "@params");
 LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
-if (!prms_ptr->params.embedding) {
+if (!prms_ptr->params.embeddings) {
 rb_raise(rb_eRuntimeError, "embedding parameter is false");
 return Qnil;
 }

@@ -2160,10 +2161,11 @@ private:
 return Qnil;
 }

+const int n_tokens = NUM2INT(rb_iv_get(self, "@n_tokens"));
 const int n_embd = llama_n_embd(model_ptr->model);
 const float* embd = llama_get_embeddings(ptr->ctx);
 VALUE output = rb_ary_new();
-for (int i = 0; i < n_embd; i++) {
+for (int i = 0; i < n_tokens * n_embd; i++) {
 rb_ary_push(output, DBL2NUM((double)(embd[i])));
 }

@@ -2182,7 +2184,7 @@ private:
 }
 VALUE params = rb_iv_get(self, "@params");
 LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
-if (!prms_ptr->params.embedding) {
+if (!prms_ptr->params.embeddings) {
 rb_raise(rb_eRuntimeError, "embedding parameter is false");
 return Qnil;
 }

@@ -2200,6 +2202,36 @@ private:
 return output;
 }

+static VALUE _llama_context_embeddings_seq(VALUE self, VALUE seq_id) {
+if (!RB_INTEGER_TYPE_P(seq_id)) {
+rb_raise(rb_eArgError, "seq_id must be an integer");
+return Qnil;
+}
+LLaMAContextWrapper* ptr = get_llama_context(self);
+if (ptr->ctx == NULL) {
+rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+return Qnil;
+}
+VALUE params = rb_iv_get(self, "@params");
+LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
+if (!prms_ptr->params.embeddings) {
+rb_raise(rb_eRuntimeError, "embedding parameter is false");
+return Qnil;
+}
+
+VALUE model = rb_iv_get(self, "@model");
+LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
+const int n_embd = llama_n_embd(model_ptr->model);
+
+VALUE output = rb_ary_new();
+const float* embd = llama_get_embeddings_seq(ptr->ctx, NUM2INT(seq_id));
+for (int i = 0; i < n_embd; i++) {
+rb_ary_push(output, DBL2NUM((double)(embd[i])));
+}
+
+return output;
+}
+
 static VALUE _llama_context_n_ctx(VALUE self) {
 LLaMAContextWrapper* ptr = get_llama_context(self);
 if (ptr->ctx == NULL) {

@@ -3229,6 +3261,7 @@ extern "C" void Init_llama_cpp(void) {
 rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_YARN", INT2NUM(LLAMA_ROPE_SCALING_TYPE_YARN));
 rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_TYPE_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_TYPE_MAX_VALUE));

+rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_UNSPECIFIED", INT2NUM(LLAMA_POOLING_TYPE_UNSPECIFIED));
 rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
 rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
 rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));
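Two behavioral changes in this section are easy to miss in the hunks alone: `Context#embeddings` now returns a flat array of `n_tokens * n_embd` values instead of a single `n_embd`-sized vector, and the new `Context#embeddings_seq` returns the pooled vector for one sequence. A hedged sketch of reading both, assuming `context` and `model` were set up as in the earlier example and a batch has already been decoded:

    n_embd = model.n_embd  # assumes the gem's Model#n_embd binding

    # Per-token embeddings: slice the flat n_tokens * n_embd array.
    per_token = context.embeddings.each_slice(n_embd).to_a

    # Pooled embedding for sequence 0; meaningful when pooling_type
    # is MEAN or CLS rather than NONE.
    seq_vector = context.embeddings_seq(0)  # => Array of n_embd Floats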
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
-VERSION = '0.13.0'
+VERSION = '0.14.0'

 # The version of llama.cpp bundled with llama_cpp.rb.
-LLAMA_CPP_VERSION = 'b2303'
+LLAMA_CPP_VERSION = 'b2361'
 end
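Both constants are plain Ruby values, so callers can branch on them at runtime; a trivial check:

    require 'llama_cpp'

    puts LLaMACpp::VERSION            # => "0.14.0"
    puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2361"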
data/sig/llama_cpp.rbs
CHANGED
@@ -50,6 +50,7 @@ module LLaMACpp
 LLAMA_ROPE_SCALING_TYPE_YARN: Integer
 LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer

+LLAMA_POOLING_TYPE_UNSPECIFIED: Integer
 LLAMA_POOLING_TYPE_NONE: Integer
 LLAMA_POOLING_TYPE_MEAN: Integer
 LLAMA_POOLING_TYPE_CLS: Integer

@@ -201,6 +202,7 @@ module LLaMACpp
 def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
 def embeddings: () -> Array[Float]
 def embeddings_ith: (Integer) -> Array[Float]
+def embeddings_seq: (Integer) -> Array[Float]
 def decode: (::LLaMACpp::Batch) -> void
 def logits: () -> Array[Float]
 def n_ctx: () -> Integer

@@ -254,6 +256,8 @@ module LLaMACpp
 def n_threads_batch=: (Integer) -> Integer
 def rope_scaling_type=: (Integer) -> Integer
 def rope_scaling_type: () -> Integer
+def pooling_type=: (Integer) -> Integer
+def pooling_type: () -> Integer
 def rope_freq_base=: (Float) -> Float
 def rope_freq_base: () -> Float
 def rope_freq_scale=: (Float) -> Float

@@ -276,12 +280,10 @@ module LLaMACpp
 def type_v: () -> Integer
 def logits_all: () -> bool
 def logits_all=: (bool) -> bool
-def embedding: () -> bool
-def embedding=: (bool) -> bool
+def embeddings: () -> bool
+def embeddings=: (bool) -> bool
 def offload_kqv: () -> bool
 def offload_kqv=: (bool) -> bool
-def do_pooling: () -> bool
-def do_pooling=: (bool) -> bool
 end

 class ModelQuantizeParams
data/vendor/tmp/llama.cpp/Makefile
CHANGED

@@ -729,10 +729,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/…
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -…
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -91,13 +91,14 @@ extern "C" {
|
|
|
91
91
|
// (optional) complete all pending operations
|
|
92
92
|
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
|
93
93
|
|
|
94
|
-
//
|
|
94
|
+
// create a plan for ggml_cgraph and free it
|
|
95
95
|
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
|
96
96
|
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
97
|
-
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
98
97
|
|
|
98
|
+
// compute graph with a plan
|
|
99
|
+
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
99
100
|
// compute graph without a plan (async)
|
|
100
|
-
|
|
101
|
+
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
101
102
|
|
|
102
103
|
// check if the backend supports an operation
|
|
103
104
|
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED

@@ -262,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
 backend->iface.graph_plan_free(backend, plan);
 }

-void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-backend->iface.graph_plan_compute(backend, plan);
+enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+return backend->iface.graph_plan_compute(backend, plan);
 }

-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 return backend->iface.graph_compute(backend, cgraph);
 }

@@ -732,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
 GGML_UNUSED(backend);
 }

-GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
 struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

 GGML_UNUSED(backend);
 }

-GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

 struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);

@@ -755,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 cplan.abort_callback = cpu_ctx->abort_callback;
 cplan.abort_callback_data = cpu_ctx->abort_callback_data;

-ggml_graph_compute(cgraph, &cplan);
-return true;
+return ggml_graph_compute(cgraph, &cplan);
 }

 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {

@@ -1437,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 return true;
 }

-static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
 uint64_t compute_us[GGML_MAX_BACKENDS] = {0};

@@ -1472,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 uint64_t compute_start_us = ggml_time_us();
 if (!sched->callback_eval) {
-if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
-return false;
+enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
+if (ec != GGML_STATUS_SUCCESS) {
+return ec;
 }
 //ggml_backend_synchronize(split_backend); // necessary to measure compute time
 } else {

@@ -1494,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {

 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

-if (!ggml_backend_graph_compute(split_backend, &gv)) {
-return false;
+enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
+if (ec != GGML_STATUS_SUCCESS) {
+return ec;
 }

 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {

@@ -1519,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 }
 #endif

-return true;
+return GGML_STATUS_SUCCESS;
 }

 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {

@@ -1581,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 return true;
 }

-bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
 GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);

 if (!sched->is_reset) {

@@ -1590,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
 ggml_backend_sched_split_graph(sched, graph);
 if (!ggml_backend_sched_alloc_splits(sched)) {
-return false;
+return GGML_STATUS_ALLOC_FAILED;
 }

-if (!ggml_backend_sched_compute_splits(sched)) {
-return false;
-}
-
-return true;
+return ggml_backend_sched_compute_splits(sched);
 }

 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED

@@ -66,12 +66,13 @@ extern "C" {
 GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

-GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
+GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);

 // tensor copy between different backends
 GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

@@ -157,26 +158,26 @@ extern "C" {
 typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

 // Initialize a backend scheduler
-GGML_API ggml_backend_sched_t …
-GGML_API void …
+GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
+GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 // Initialize backend buffers from a measure graph
-GGML_API bool …
+GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
 // Get the number of splits of the last graph
-GGML_API int …
+GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);

-GGML_API size_t …
+GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

-GGML_API void …
-GGML_API ggml_backend_t …
+GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

 // Allocate and compute graph on the backend scheduler
-GGML_API …
+GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

 // Reset all assignments and allocators - must be called before changing the node backends
-GGML_API void …
+GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

 // Set a callback to be called for each resulting node during graph compute
-GGML_API void …
+GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

 //
 // Utils