llama_cpp 0.11.1 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 705f8a7e2228a324f14fa23ca093e2ce4408eacb839d891284c40e319b07940a
- data.tar.gz: fc04d232f2c7ecfa1402aa711eda63e36a03d287a6bc7c8e0d2c791194ad9e9a
+ metadata.gz: 13381408318e71cc1fc55c40ee9be6e62ad9e3ad6a8ce39279bb8040614e9b3b
+ data.tar.gz: 6456734b18865a7811f08d0d9d599771f574f4b59bd5b54a964ece7428115907
  SHA512:
- metadata.gz: 888ba5bfa23ab51746d49c2cc071f8a220d3de39f6c3a34576f35bcb993fc0be841481dcbca9762504397ca6555571f43b4ba2c0ae3dae3fcd8d29bd2735ae16
- data.tar.gz: e8940f8ab7a542569a71ad5c869ac781b8ad958ca02d2a5547b792008c228ae1e9ff23cc5b9552e83b0c631805ec7edfea6138c5ae9d922daca06e5ab3f1490d
+ metadata.gz: 1014349771d7aa3c318027de11603e96d5482e4bd5b1bcf0fd4874040245daf44c4cfb801077a698846459a7619ca9e01e0afc3507fc7bd519e7ba68a000a15d
+ data.tar.gz: 1315ca8954397edb0db93347a10762e35f829377ef3dba0ea9cf6c67f986972ac8e75b46c410a3ceceefc0474f2abbe6f441e56a60e789ef1d2617fc15cfb29e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,18 @@
+ ## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
+
+ - Bump bundled llama.cpp from b1768 to b1833.
+ - Add model file type constants.
+ - Add `kv_cache_seq_div` method to `Context`.
+
+ ## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
+
+ - Add `get_one` singleton method to `Batch`.
+
+ **Breaking Changes**
+
+ - Add deprecation warning to `eval`, `eval_embd`, and `sample_temperature` methods on `Context`.
+ - Change to avoid using deprecated methods on `generate` method and example scripts.
+
  ## [[0.11.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.0...v0.11.1)] - 2024-01-08

  - Fix to set the values of `@n_tokens` and `@has_evaluated` instance variables in `decode` method of `Context`.
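The 0.12.x entries above describe the user-visible API changes: a batch-based decode path, a new KV-cache helper, and deprecation of `eval`, `eval_embd`, and `sample_temperature`. A minimal sketch of the new calls (the model path and prompt are placeholders, not part of this release; usage follows the signatures shown in the data/sig/llama_cpp.rbs hunks below):

```ruby
require 'llama_cpp'

# Placeholder model path for illustration only.
model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)

context_params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model: model, params: context_params)

tokens = model.tokenize(text: 'Hello my name is', add_bos: true)

# 0.12.0: build a single-sequence batch with Batch.get_one and decode it,
# instead of calling the now-deprecated Context#eval.
batch = LLaMACpp::Batch.get_one(tokens: tokens, n_tokens: tokens.size, pos_zero: 0, seq_id: 0)
context.decode(batch)

# 0.12.1: kv_cache_seq_div(seq_id, p0, p1, d) wraps llama_kv_cache_seq_div
# from the bundled llama.cpp (see the C++ hunk later in this diff).
context.kv_cache_seq_div(0, 0, tokens.size, 2)
```

Sampling code migrates the same way: `sample_temperature(candidates, temperature: t)` becomes `sample_temp(candidates, temp: t)`, as the example scripts below show.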
data/README.md CHANGED
@@ -22,14 +22,14 @@ If bundler is not being used to manage dependencies, install the gem by executin
  $ gem install llama_cpp
  ```

- There are several installation options for improving execution performance:
+ There are several installation options:

  ```sh
  # use OpenBLAS
  $ gem install llama_cpp -- --with-openblas

- # use Metal on macOS
- $ gem install llama_cpp -- --with-metal
+ # use CUDA
+ $ gem install llama_cpp -- --with-cuda
  ```

  Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
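The extconf.rb mentioned in that context line consumes these flags through mkmf's `with_config`. An illustrative fragment (not the gem's actual build script) showing how such a flag is typically handled:

```ruby
# extconf.rb sketch — illustrative only, not the file shipped with the gem.
require 'mkmf'

# `gem install llama_cpp -- --with-openblas` makes with_config('openblas') truthy.
if with_config('openblas')
  abort 'libopenblas not found' unless have_library('openblas')
end

# Hypothetical handling of `--with-cuda`: toggle a compile-time define.
$CFLAGS << ' -DGGML_USE_CUBLAS' if with_config('cuda')

create_makefile('llama_cpp/llama_cpp')
```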
data/examples/chat.rb CHANGED
@@ -9,6 +9,7 @@
  require 'llama_cpp'
  require 'thor'
  require 'readline'
+ require 'etc'

  class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  default_command :main
@@ -30,12 +31,15 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
  option :temp, type: :numeric, desc: 'temperature', default: 0.8
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+ option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  mdl_params = LLaMACpp::ModelParams.new
  mdl_params.n_gpu_layers = options[:n_gpu_layers]
  model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
  ctx_params = LLaMACpp::ContextParams.new
  ctx_params.seed = options[:seed] if options[:seed] != -1
+ ctx_params.n_threads = options[:n_threads]
+ ctx_params.n_threads_batch = options[:n_threads]
  context = LLaMACpp::Context.new(model: model, params: ctx_params)

  antiprompt = options[:reverse_prompt] || 'User:'
@@ -70,7 +74,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation

  0.step(embd.size - 1, options[:batch_size]) do |i|
  n_eval = [options[:batch_size], embd.size - i].min
- context.eval(tokens: embd[i...i + n_eval], n_past: n_past)
+ context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
  n_past += n_eval
  end
  end
@@ -95,7 +99,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  context.sample_tail_free(candidates, z: options[:tfs_z])
  context.sample_typical(candidates, prob: options[:typical_p])
  context.sample_top_p(candidates, prob: options[:top_p])
- context.sample_temperature(candidates, temperature: options[:temp])
+ context.sample_temp(candidates, temp: options[:temp])
  id = context.sample_token(candidates)

  last_n_tokens.shift
@@ -7,6 +7,7 @@

  require 'llama_cpp'
  require 'thor'
+ require 'etc'

  class Embedding < Thor # rubocop:disable Style/Documentation
  default_command :main
@@ -15,6 +16,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
  option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+ option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  mdl_params = LLaMACpp::ModelParams.new
  mdl_params.n_gpu_layers = options[:n_gpu_layers]
@@ -22,13 +24,15 @@ class Embedding < Thor # rubocop:disable Style/Documentation
  ctx_params = LLaMACpp::ContextParams.new
  ctx_params.embedding = true
  ctx_params.seed = options[:seed] if options[:seed] != -1
+ ctx_params.n_threads = options[:n_threads]
+ ctx_params.n_threads_batch = options[:n_threads]
  context = LLaMACpp::Context.new(model: model, params: ctx_params)

  embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)

  return unless embd_input.size.positive?

- context.eval(tokens: embd_input, n_past: 0)
+ context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))

  context.embeddings.each { |val| print("#{val} ") }
  print("\n")
data/examples/simple.rb CHANGED
@@ -7,12 +7,14 @@

  require 'llama_cpp'
  require 'thor'
+ require 'etc'

  class Simple < Thor # rubocop:disable Style/Documentation
  default_command :main
  desc 'main', 'Simple completion'
  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
  option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
+ option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  n_len = 32
  model_params = LLaMACpp::ModelParams.new
@@ -21,7 +23,8 @@ class Simple < Thor # rubocop:disable Style/Documentation
  context_params.seed = 1234
  context_params.n_ctx = 2048
  context_params.logits_all = true
- context_params.n_threads = 4
+ context_params.n_threads = options[:n_threads]
+ context_params.n_threads_batch = options[:n_threads]
  context = LLaMACpp::Context.new(model: model, params: context_params)

  tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)
@@ -51,6 +51,7 @@ public:
  static void define_class(VALUE outer) {
  rb_cLLaMABatch = rb_define_class_under(outer, "Batch", rb_cObject);
  rb_define_alloc_func(rb_cLLaMABatch, llama_batch_alloc);
+ rb_define_singleton_method(rb_cLLaMABatch, "get_one", RUBY_METHOD_FUNC(_llama_batch_get_one), -1);
  rb_define_method(rb_cLLaMABatch, "initialize", RUBY_METHOD_FUNC(_llama_batch_initialize), -1);
  rb_define_method(rb_cLLaMABatch, "n_tokens=", RUBY_METHOD_FUNC(_llama_batch_set_n_tokens), 1);
  rb_define_method(rb_cLLaMABatch, "n_tokens", RUBY_METHOD_FUNC(_llama_batch_get_n_tokens), 0);
@@ -75,6 +76,48 @@ public:
  private:
  static const rb_data_type_t llama_batch_type;

+ static VALUE _llama_batch_get_one(int argc, VALUE* argv, VALUE klass) {
+ VALUE kw_args = Qnil;
+ ID kw_table[4] = { rb_intern("tokens"), rb_intern("n_tokens"), rb_intern("pos_zero"), rb_intern("seq_id") };
+ VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+ rb_scan_args(argc, argv, ":", &kw_args);
+ rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+ if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+ rb_raise(rb_eArgError, "tokens must be an array");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+ rb_raise(rb_eArgError, "n_tokens must be an integer");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+ rb_raise(rb_eArgError, "pos_zero must be an integer");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+ rb_raise(rb_eArgError, "seq_id must be an integer");
+ return Qnil;
+ }
+
+ const size_t sz_array = RARRAY_LEN(kw_values[0]);
+ const int32_t n_tokens = NUM2INT(kw_values[1]);
+ const llama_pos pos_zero = NUM2INT(kw_values[2]);
+ const llama_seq_id seq_id = NUM2INT(kw_values[3]);
+
+ LLaMABatchWrapper* ptr = (LLaMABatchWrapper*)ruby_xmalloc(sizeof(LLaMABatchWrapper));
+ new (ptr) LLaMABatchWrapper();
+ ptr->batch = llama_batch_get_one(nullptr, n_tokens, pos_zero, seq_id);
+
+ ptr->batch.token = (llama_token*)malloc(sizeof(llama_token) * sz_array);
+ for (size_t i = 0; i < sz_array; i++) {
+ VALUE el = rb_ary_entry(kw_values[0], i);
+ ptr->batch.token[i] = NUM2INT(el);
+ }
+
+ return TypedData_Wrap_Struct(klass, &llama_batch_type, ptr);
+ }
+
  static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("max_n_token"), rb_intern("n_embd"), rb_intern("max_n_seq") };
@@ -1983,6 +2026,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
@@ -2054,6 +2098,8 @@ private:
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);

+ rb_warn("eval is deprecated. Use decode instead.");
+
  if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
  rb_raise(rb_eArgError, "tokens must be an Array");
  return Qnil;
@@ -2104,6 +2150,8 @@ private:
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);

+ rb_warn("eval_embd is deprecated. Use decode instead.");
+
  if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
  rb_raise(rb_eArgError, "tokens must be an Array");
  return Qnil;
@@ -2331,6 +2379,16 @@ private:
  return Qnil;
  }

+ static VALUE _llama_context_kv_cache_seq_div(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE d) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_seq_div(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(d));
+ return Qnil;
+ }
+
  static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2794,6 +2852,8 @@ private:
  rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
  rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);

+ rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
+
  if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
  rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
  return Qnil;
@@ -3160,6 +3220,9 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.11.1'
+ VERSION = '0.12.1'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1768'
+ LLAMA_CPP_VERSION = 'b1833'
  end
data/lib/llama_cpp.rb CHANGED
@@ -54,7 +54,7 @@ module LLaMACpp
  embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
  end

- context.eval(tokens: embd, n_past: n_past)
+ context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
  end

  n_past += embd.size
@@ -77,7 +77,7 @@ module LLaMACpp
  context.sample_tail_free(candidates, z: tfs_z)
  context.sample_typical(candidates, prob: typical_p)
  context.sample_top_p(candidates, prob: top_p)
- context.sample_temperature(candidates, temperature: temperature)
+ context.sample_temp(candidates, temp: temperature)
  id = context.sample_token(candidates)

  last_n_tokens.shift
data/sig/llama_cpp.rbs CHANGED
@@ -22,6 +22,9 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer
+ LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
+ LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer

  LLAMA_KV_OVERRIDE_INT: Integer
  LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -149,6 +152,7 @@ module LLaMACpp
  class Batch
  public

+ def self.get_one: (tokens: Array[Integer], n_tokens: Integer, pos_zero: Integer, seq_id: Integer) -> ::LLaMACpp::Batch
  def initialize: (max_n_token: Integer, n_embd: Integer, max_n_seq: Integer) -> void
  def n_tokens=: (Integer) -> Integer
  def n_tokens: () -> Integer
@@ -192,6 +196,7 @@ module LLaMACpp
  def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
  def kv_cache_seq_keep: (Integer) -> void
  def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
  def set_rng_seed: (Integer) -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
@@ -1,8 +1,8 @@
  # Define the default target now so that it is always the first target
  BUILD_TARGETS = \
- main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
  simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
+ speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o

  # Binaries only useful for tests
  TEST_TARGETS = \
@@ -620,6 +620,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
  perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+ imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
  embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -671,6 +674,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
  lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+ passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
  ifdef LLAMA_METAL
  metal: examples/metal/metal.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -90,7 +90,7 @@ extern "C" {
  void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

  // compute graph without a plan
- void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

  // check if the backend supports an operation
  bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
@@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
  ggml_backend_synchronize(backend);
  }

- void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- backend->iface.graph_compute(backend, cgraph);
+ bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ if (!backend->iface.graph_compute(backend, cgraph)) {
+ return false;
+ }

  // TODO: optional sync
  ggml_backend_synchronize(backend);
+ return true;
  }

  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
  GGML_UNUSED(backend);
  }

- static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

  struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
  cplan.work_data = cpu_ctx->work_data;

  ggml_graph_compute(cgraph, &cplan);
+ return true;
  }

  static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -58,7 +58,7 @@ extern "C" {

  GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
  GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
  GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);

  // tensor copy between different backends