llama_cpp 0.12.7 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/sig/llama_cpp.rbs CHANGED
@@ -27,14 +27,14 @@ module LLaMACpp
27
27
  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
28
28
  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
29
29
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
30
- LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
30
+ LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
31
31
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
32
32
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
33
33
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
34
34
 
35
- LLAMA_KV_OVERRIDE_INT: Integer
36
- LLAMA_KV_OVERRIDE_FLOAT: Integer
37
- LLAMA_KV_OVERRIDE_BOOL: Integer
35
+ LLAMA_KV_OVERRIDE_TYPE_INT: Integer
36
+ LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
37
+ LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
38
38
 
39
39
  LLAMA_GRETYPE_END: Integer
40
40
  LLAMA_GRETYPE_ALT: Integer
@@ -44,19 +44,20 @@ module LLaMACpp
44
44
  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
45
45
  LLAMA_GRETYPE_CHAR_ALT: Integer
46
46
 
47
- LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
48
- LLAMA_ROPE_SCALING_NONE: Integer
49
- LLAMA_ROPE_SCALING_LINEAR: Integer
50
- LLAMA_ROPE_SCALING_YARN: Integer
51
- LLAMA_ROPE_SCALING_MAX_VALUE: Integer
47
+ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
48
+ LLAMA_ROPE_SCALING_TYPE_NONE: Integer
49
+ LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
50
+ LLAMA_ROPE_SCALING_TYPE_YARN: Integer
51
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer
52
52
 
53
- LLAMA_POOLING_NONE: Integer
54
- LLAMA_POOLING_MEAN: Integer
55
- LLAMA_POOLING_CLS: Integer
53
+ LLAMA_POOLING_TYPE_UNSPECIFIED: Integer
54
+ LLAMA_POOLING_TYPE_NONE: Integer
55
+ LLAMA_POOLING_TYPE_MEAN: Integer
56
+ LLAMA_POOLING_TYPE_CLS: Integer
56
57
 
57
- LLAMA_SPLIT_NONE: Integer
58
- LLAMA_SPLIT_LAYER: Integer
59
- LLAMA_SPLIT_ROW: Integer
58
+ LLAMA_SPLIT_MODE_NONE: Integer
59
+ LLAMA_SPLIT_MODE_LAYER: Integer
60
+ LLAMA_SPLIT_MODE_ROW: Integer
60
61
 
61
62
  def self?.backend_init: () -> void
62
63
  def self?.backend_free: () -> void
@@ -68,8 +69,6 @@ module LLaMACpp
68
69
  ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
69
70
  def self?.print_system_info: () -> void
70
71
  def self?.time_us: () -> Integer
71
- def self?.mmap_supported?: () -> bool
72
- def self?.mlock_supported?: () -> bool
73
72
  def self?.max_devices: () -> Integer
74
73
  def self?.supports_mmap?: () -> bool
75
74
  def self?.supports_mlock?: () -> bool
@@ -103,7 +102,8 @@ module LLaMACpp
103
102
  def empty?: () -> bool
104
103
  def free: () -> void
105
104
  def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
106
- def apply_lora_from_file: (lora_path: String, ?scale: Float, ?base_model_path: String, ?n_threads: Integer) -> void
105
+ def vocab_type: () -> Integer
106
+ def rope_type: () -> Integer
107
107
  def n_vocab: () -> Integer
108
108
  def n_ctx_train: () -> Integer
109
109
  def n_embd: () -> Integer
@@ -202,8 +202,7 @@ module LLaMACpp
202
202
  def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
203
203
  def embeddings: () -> Array[Float]
204
204
  def embeddings_ith: (Integer) -> Array[Float]
205
- def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
206
- def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
205
+ def embeddings_seq: (Integer) -> Array[Float]
207
206
  def decode: (::LLaMACpp::Batch) -> void
208
207
  def logits: () -> Array[Float]
209
208
  def n_ctx: () -> Integer
@@ -216,14 +215,16 @@ module LLaMACpp
216
215
  def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
217
216
  def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
218
217
  def kv_cache_seq_keep: (Integer) -> void
219
- def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
218
+ def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
220
219
  def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
220
+ def kv_cache_seq_pos_max: (Integer) -> Integer
221
+ def kv_cache_defrag: () -> void
222
+ def kv_cache_update: () -> void
221
223
  def set_rng_seed: (Integer) -> void
222
224
  def load_session_file: (session_path: String) -> void
223
225
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
224
226
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
225
227
  def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
226
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
227
228
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
228
229
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
229
230
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -232,7 +233,6 @@ module LLaMACpp
232
233
  def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
233
234
  def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
234
235
  def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
235
- def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
236
236
  def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
237
237
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
238
238
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
@@ -256,6 +256,8 @@ module LLaMACpp
256
256
  def n_threads_batch=: (Integer) -> Integer
257
257
  def rope_scaling_type=: (Integer) -> Integer
258
258
  def rope_scaling_type: () -> Integer
259
+ def pooling_type=: (Integer) -> Integer
260
+ def pooling_type: () -> Integer
259
261
  def rope_freq_base=: (Float) -> Float
260
262
  def rope_freq_base: () -> Float
261
263
  def rope_freq_scale=: (Float) -> Float
@@ -270,20 +272,18 @@ module LLaMACpp
270
272
  def yarn_beta_slow: () -> Float
271
273
  def yarn_orig_ctx=: (Integer) -> Integer
272
274
  def yarn_orig_ctx: () -> Integer
275
+ def defrag_thold=: (Float) -> Float
276
+ def defrag_thold: () -> Float
273
277
  def type_k=: (Integer) -> Integer
274
278
  def type_k: () -> Integer
275
279
  def type_v=: (Integer) -> Integer
276
280
  def type_v: () -> Integer
277
- def mul_mat_q: () -> bool
278
- def mul_mat_q=: (bool) -> bool
279
281
  def logits_all: () -> bool
280
282
  def logits_all=: (bool) -> bool
281
- def embedding: () -> bool
282
- def embedding=: (bool) -> bool
283
+ def embeddings: () -> bool
284
+ def embeddings=: (bool) -> bool
283
285
  def offload_kqv: () -> bool
284
286
  def offload_kqv=: (bool) -> bool
285
- def do_pooling: () -> bool
286
- def do_pooling=: (bool) -> bool
287
287
  end
288
288
 
289
289
  class ModelQuantizeParams
@@ -383,8 +383,13 @@ ifdef LLAMA_BLIS
383
383
  endif # LLAMA_BLIS
384
384
 
385
385
  ifdef LLAMA_CUBLAS
386
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
387
- MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
386
+ ifneq ('', '$(wildcard /opt/cuda)')
387
+ CUDA_PATH ?= /opt/cuda
388
+ else
389
+ CUDA_PATH ?= /usr/local/cuda
390
+ endif
391
+ MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
392
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
388
393
  OBJS += ggml-cuda.o
389
394
  MK_NVCCFLAGS += -use_fast_math
390
395
  ifdef LLAMA_FATAL_WARNINGS
@@ -599,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
599
604
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
600
605
  ifdef LLAMA_CUBLAS
601
606
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
602
- CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
607
+ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
603
608
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
604
609
  ifndef CUDA_DOCKER_ARCH
605
610
  ifndef CUDA_POWER_ARCH
@@ -724,10 +729,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
724
729
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
725
730
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
726
731
 
727
- server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
732
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
728
733
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
729
- $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
730
- $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
734
+ $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
731
735
 
732
736
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
733
737
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -91,19 +91,22 @@ extern "C" {
91
91
  // (optional) complete all pending operations
92
92
  void (*GGML_CALL synchronize)(ggml_backend_t backend);
93
93
 
94
- // compute graph with a plan
94
+ // create a plan for ggml_cgraph and free it
95
95
  ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
96
96
  void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
97
- void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
98
97
 
98
+ // compute graph with a plan
99
+ enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
99
100
  // compute graph without a plan (async)
100
- bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
101
+ enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
101
102
 
102
103
  // check if the backend supports an operation
103
104
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
104
105
  };
105
106
 
106
107
  struct ggml_backend {
108
+ ggml_guid_t guid;
109
+
107
110
  struct ggml_backend_i iface;
108
111
 
109
112
  ggml_backend_context_t context;
@@ -12,7 +12,6 @@
12
12
 
13
13
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
14
 
15
-
16
15
  // backend buffer type
17
16
 
18
17
  const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -159,6 +158,13 @@ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml
159
158
 
160
159
  // backend
161
160
 
161
+ ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
162
+ if (backend == NULL) {
163
+ return NULL;
164
+ }
165
+ return backend->guid;
166
+ }
167
+
162
168
  const char * ggml_backend_name(ggml_backend_t backend) {
163
169
  if (backend == NULL) {
164
170
  return "NULL";
@@ -256,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
256
262
  backend->iface.graph_plan_free(backend, plan);
257
263
  }
258
264
 
259
- void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
260
- backend->iface.graph_plan_compute(backend, plan);
265
+ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
266
+ return backend->iface.graph_plan_compute(backend, plan);
261
267
  }
262
268
 
263
- bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
269
+ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
264
270
  return backend->iface.graph_compute(backend, cgraph);
265
271
  }
266
272
 
@@ -726,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
726
732
  GGML_UNUSED(backend);
727
733
  }
728
734
 
729
- GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
735
+ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
730
736
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
731
737
 
732
- ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
738
+ return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
733
739
 
734
740
  GGML_UNUSED(backend);
735
741
  }
736
742
 
737
- GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
743
+ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
738
744
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
739
745
 
740
746
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -749,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
749
755
  cplan.abort_callback = cpu_ctx->abort_callback;
750
756
  cplan.abort_callback_data = cpu_ctx->abort_callback_data;
751
757
 
752
- ggml_graph_compute(cgraph, &cplan);
753
- return true;
758
+ return ggml_graph_compute(cgraph, &cplan);
754
759
  }
755
760
 
756
761
  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -781,6 +786,11 @@ static struct ggml_backend_i cpu_backend_i = {
781
786
  /* .supports_op = */ ggml_backend_cpu_supports_op,
782
787
  };
783
788
 
789
+ static ggml_guid_t ggml_backend_cpu_guid(void) {
790
+ static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
791
+ return &guid;
792
+ }
793
+
784
794
  ggml_backend_t ggml_backend_cpu_init(void) {
785
795
  struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
786
796
  if (ctx == NULL) {
@@ -800,6 +810,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
800
810
  }
801
811
 
802
812
  *cpu_backend = (struct ggml_backend) {
813
+ /* .guid = */ ggml_backend_cpu_guid(),
803
814
  /* .interface = */ cpu_backend_i,
804
815
  /* .context = */ ctx
805
816
  };
@@ -807,7 +818,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
807
818
  }
808
819
 
809
820
  GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
810
- return backend && backend->iface.get_name == ggml_backend_cpu_name;
821
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
811
822
  }
812
823
 
813
824
  void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
@@ -1425,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1425
1436
  return true;
1426
1437
  }
1427
1438
 
1428
- static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1439
+ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1429
1440
  uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
1430
1441
  uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
1431
1442
 
@@ -1460,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1460
1471
 
1461
1472
  uint64_t compute_start_us = ggml_time_us();
1462
1473
  if (!sched->callback_eval) {
1463
- if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
1464
- return false;
1474
+ enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
1475
+ if (ec != GGML_STATUS_SUCCESS) {
1476
+ return ec;
1465
1477
  }
1466
1478
  //ggml_backend_synchronize(split_backend); // necessary to measure compute time
1467
1479
  } else {
@@ -1482,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1482
1494
 
1483
1495
  struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
1484
1496
 
1485
- if (!ggml_backend_graph_compute(split_backend, &gv)) {
1486
- return false;
1497
+ enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
1498
+ if (ec != GGML_STATUS_SUCCESS) {
1499
+ return ec;
1487
1500
  }
1488
1501
 
1489
1502
  if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
@@ -1507,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1507
1520
  }
1508
1521
  #endif
1509
1522
 
1510
- return true;
1523
+ return GGML_STATUS_SUCCESS;
1511
1524
  }
1512
1525
 
1513
1526
  ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1569,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
1569
1582
  return true;
1570
1583
  }
1571
1584
 
1572
- bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1585
+ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1573
1586
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1574
1587
 
1575
1588
  if (!sched->is_reset) {
@@ -1578,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
1578
1591
 
1579
1592
  ggml_backend_sched_split_graph(sched, graph);
1580
1593
  if (!ggml_backend_sched_alloc_splits(sched)) {
1581
- return false;
1594
+ return GGML_STATUS_ALLOC_FAILED;
1582
1595
  }
1583
1596
 
1584
- if (!ggml_backend_sched_compute_splits(sched)) {
1585
- return false;
1586
- }
1587
-
1588
- return true;
1597
+ return ggml_backend_sched_compute_splits(sched);
1589
1598
  }
1590
1599
 
1591
1600
  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
@@ -49,7 +49,7 @@ extern "C" {
49
49
  // Backend
50
50
  //
51
51
 
52
-
52
+ GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
53
53
  GGML_API const char * ggml_backend_name(ggml_backend_t backend);
54
54
  GGML_API void ggml_backend_free(ggml_backend_t backend);
55
55
 
@@ -66,12 +66,13 @@ extern "C" {
66
66
 
67
67
  GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
68
68
 
69
- GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
69
+ GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
70
+ GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
71
+
72
+ GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
73
+ GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
70
74
 
71
- GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
72
- GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
73
- GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
74
- GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
75
+ GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
75
76
 
76
77
  // tensor copy between different backends
77
78
  GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
@@ -157,26 +158,26 @@ extern "C" {
157
158
  typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
158
159
 
159
160
  // Initialize a backend scheduler
160
- GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
161
- GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
161
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
162
+ GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
162
163
  // Initialize backend buffers from a measure graph
163
- GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
164
+ GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
164
165
  // Get the number of splits of the last graph
165
- GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
166
+ GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
166
167
 
167
- GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
168
+ GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
168
169
 
169
- GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
170
- GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
170
+ GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
171
+ GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
171
172
 
172
173
  // Allocate and compute graph on the backend scheduler
173
- GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
174
+ GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
174
175
 
175
176
  // Reset all assignments and allocators - must be called before changing the node backends
176
- GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
177
+ GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
177
178
 
178
179
  // Set a callback to be called for each resulting node during graph compute
179
- GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
180
+ GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
180
181
 
181
182
  //
182
183
  // Utils