llama_cpp 0.12.7 → 0.14.0

data/sig/llama_cpp.rbs CHANGED
@@ -27,14 +27,14 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
- LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer

- LLAMA_KV_OVERRIDE_INT: Integer
- LLAMA_KV_OVERRIDE_FLOAT: Integer
- LLAMA_KV_OVERRIDE_BOOL: Integer
+ LLAMA_KV_OVERRIDE_TYPE_INT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
+ LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer

  LLAMA_GRETYPE_END: Integer
  LLAMA_GRETYPE_ALT: Integer
@@ -44,19 +44,20 @@ module LLaMACpp
  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
  LLAMA_GRETYPE_CHAR_ALT: Integer

- LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
- LLAMA_ROPE_SCALING_NONE: Integer
- LLAMA_ROPE_SCALING_LINEAR: Integer
- LLAMA_ROPE_SCALING_YARN: Integer
- LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
+ LLAMA_ROPE_SCALING_TYPE_NONE: Integer
+ LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
+ LLAMA_ROPE_SCALING_TYPE_YARN: Integer
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE: Integer

- LLAMA_POOLING_NONE: Integer
- LLAMA_POOLING_MEAN: Integer
- LLAMA_POOLING_CLS: Integer
+ LLAMA_POOLING_TYPE_UNSPECIFIED: Integer
+ LLAMA_POOLING_TYPE_NONE: Integer
+ LLAMA_POOLING_TYPE_MEAN: Integer
+ LLAMA_POOLING_TYPE_CLS: Integer

- LLAMA_SPLIT_NONE: Integer
- LLAMA_SPLIT_LAYER: Integer
- LLAMA_SPLIT_ROW: Integer
+ LLAMA_SPLIT_MODE_NONE: Integer
+ LLAMA_SPLIT_MODE_LAYER: Integer
+ LLAMA_SPLIT_MODE_ROW: Integer

  def self?.backend_init: () -> void
  def self?.backend_free: () -> void
@@ -68,8 +69,6 @@ module LLaMACpp
  ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
  def self?.print_system_info: () -> void
  def self?.time_us: () -> Integer
- def self?.mmap_supported?: () -> bool
- def self?.mlock_supported?: () -> bool
  def self?.max_devices: () -> Integer
  def self?.supports_mmap?: () -> bool
  def self?.supports_mlock?: () -> bool
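
The enum-style constants now carry a TYPE/MODE discriminator to match the upstream llama.cpp names (LLAMA_KV_OVERRIDE_TYPE_*, LLAMA_ROPE_SCALING_TYPE_*, LLAMA_POOLING_TYPE_*, LLAMA_SPLIT_MODE_*), and the module-level mmap_supported?/mlock_supported? predicates are gone in favor of supports_mmap?/supports_mlock?. A minimal migration sketch using only names from the signatures above (assuming ContextParams.new takes no arguments, as in prior releases):

    require 'llama_cpp'

    # feature probes: supports_mmap? / supports_mlock? replace the removed
    # mmap_supported? / mlock_supported?
    warn 'mmap is not available on this platform'  unless LLaMACpp.supports_mmap?
    warn 'mlock is not available on this platform' unless LLaMACpp.supports_mlock?

    params = LLaMACpp::ContextParams.new
    # formerly LLaMACpp::LLAMA_ROPE_SCALING_LINEAR
    params.rope_scaling_type = LLaMACpp::LLAMA_ROPE_SCALING_TYPE_LINEAR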
@@ -103,7 +102,8 @@ module LLaMACpp
  def empty?: () -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
- def apply_lora_from_file: (lora_path: String, ?scale: Float, ?base_model_path: String, ?n_threads: Integer) -> void
+ def vocab_type: () -> Integer
+ def rope_type: () -> Integer
  def n_vocab: () -> Integer
  def n_ctx_train: () -> Integer
  def n_embd: () -> Integer
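
Model drops apply_lora_from_file and gains vocab_type/rope_type readers. A sketch, assuming Model.new accepts the same model_path:/params: keywords as #load (the path is a placeholder):

    model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf',
                                params: LLaMACpp::ModelParams.new)
    model.vocab_type # => Integer, the tokenizer family reported by llama.cpp
    model.rope_type  # => Integer, the RoPE variant used by the model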
@@ -202,8 +202,7 @@ module LLaMACpp
  def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
  def embeddings: () -> Array[Float]
  def embeddings_ith: (Integer) -> Array[Float]
- def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
- def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
+ def embeddings_seq: (Integer) -> Array[Float]
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
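
The token-array eval/eval_embd methods are removed; batch-based decode is the remaining evaluation path, and the new embeddings_seq reads back a per-sequence (pooled) embedding by sequence id. A sketch; constructing the LLaMACpp::Batch is elided because its constructor lies outside this excerpt:

    # `context` is a LLaMACpp::Context; `batch` is a LLaMACpp::Batch holding
    # the prompt tokens (built elsewhere)
    context.decode(batch)                  # replaces eval/eval_embd
    logits = context.logits                # per-token logits as a flat Array[Float]
    embedding = context.embeddings_seq(0)  # pooled embedding for sequence id 0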
@@ -216,14 +215,16 @@ module LLaMACpp
  def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
  def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
  def kv_cache_seq_keep: (Integer) -> void
- def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_add: (Integer, Integer, Integer, Integer) -> void
  def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_pos_max: (Integer) -> Integer
+ def kv_cache_defrag: () -> void
+ def kv_cache_update: () -> void
  def set_rng_seed: (Integer) -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
  def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
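
kv_cache_seq_shift is renamed kv_cache_seq_add, following the upstream llama.cpp rename; upstream documents the last argument as a signed delta applied to the positions in the given range. The new defrag/update pair schedules and applies KV-cache maintenance. A sketch with illustrative sequence ids and positions:

    # add -16 to the positions of cells [32, 64) in sequence 0
    # (this was kv_cache_seq_shift in 0.12.x)
    context.kv_cache_seq_add(0, 32, 64, -16)
    context.kv_cache_seq_pos_max(0) # => largest position stored for sequence 0
    context.kv_cache_defrag         # request defragmentation of the cache
    context.kv_cache_update         # apply pending shifts/defragmentation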
@@ -232,7 +233,6 @@ module LLaMACpp
  def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
  def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
  def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
- def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
  def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
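
The sample_temperature alias is removed; sample_temp (with its temp: keyword) is the one remaining spelling. A sketch of a sampling chain using only methods from the signatures above:

    # `candidates` is a LLaMACpp::TokenDataArray built from the current logits
    context.sample_top_k(candidates, k: 40)
    context.sample_top_p(candidates, prob: 0.95)
    context.sample_temp(candidates, temp: 0.8) # was sample_temperature(..., temperature: 0.8)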
@@ -256,6 +256,8 @@ module LLaMACpp
  def n_threads_batch=: (Integer) -> Integer
  def rope_scaling_type=: (Integer) -> Integer
  def rope_scaling_type: () -> Integer
+ def pooling_type=: (Integer) -> Integer
+ def pooling_type: () -> Integer
  def rope_freq_base=: (Float) -> Float
  def rope_freq_base: () -> Float
  def rope_freq_scale=: (Float) -> Float
@@ -270,20 +272,18 @@ module LLaMACpp
  def yarn_beta_slow: () -> Float
  def yarn_orig_ctx=: (Integer) -> Integer
  def yarn_orig_ctx: () -> Integer
+ def defrag_thold=: (Float) -> Float
+ def defrag_thold: () -> Float
  def type_k=: (Integer) -> Integer
  def type_k: () -> Integer
  def type_v=: (Integer) -> Integer
  def type_v: () -> Integer
- def mul_mat_q: () -> bool
- def mul_mat_q=: (bool) -> bool
  def logits_all: () -> bool
  def logits_all=: (bool) -> bool
- def embedding: () -> bool
- def embedding=: (bool) -> bool
+ def embeddings: () -> bool
+ def embeddings=: (bool) -> bool
  def offload_kqv: () -> bool
  def offload_kqv=: (bool) -> bool
- def do_pooling: () -> bool
- def do_pooling=: (bool) -> bool
  end

  class ModelQuantizeParams
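
On ContextParams, embedding/embedding= are renamed embeddings/embeddings=, the do_pooling flag is superseded by an explicit pooling_type, and defrag_thold sets the fragmentation threshold that triggers automatic KV-cache defragmentation (negative disables it, mirroring llama.cpp). A sketch of an embedding-oriented configuration; the threshold value is illustrative:

    params = LLaMACpp::ContextParams.new
    params.embeddings = true # renamed from embedding=
    # replaces `do_pooling = true`; pick an explicit pooling strategy
    params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_MEAN
    params.defrag_thold = 0.1 # defragment when more than 10% of cells are holes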
Makefile CHANGED
@@ -383,8 +383,13 @@ ifdef LLAMA_BLIS
  endif # LLAMA_BLIS

  ifdef LLAMA_CUBLAS
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
- MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
+ ifneq ('', '$(wildcard /opt/cuda)')
+ CUDA_PATH ?= /opt/cuda
+ else
+ CUDA_PATH ?= /usr/local/cuda
+ endif
+ MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  MK_NVCCFLAGS += -use_fast_math
  ifdef LLAMA_FATAL_WARNINGS
@@ -599,7 +604,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
  ifdef LLAMA_CUBLAS
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
- CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
  ifndef CUDA_DOCKER_ARCH
  ifndef CUDA_POWER_ARCH
@@ -724,10 +729,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
- $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
ggml-backend-impl.h CHANGED
@@ -91,19 +91,22 @@ extern "C" {
  // (optional) complete all pending operations
  void (*GGML_CALL synchronize)(ggml_backend_t backend);

- // compute graph with a plan
+ // create a plan for ggml_cgraph and free it
  ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
  void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

+ // compute graph with a plan
+ enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
  // compute graph without a plan (async)
- bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

  // check if the backend supports an operation
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
  };

  struct ggml_backend {
+ ggml_guid_t guid;
+
  struct ggml_backend_i iface;

  ggml_backend_context_t context;
ggml-backend.c CHANGED
@@ -12,7 +12,6 @@

  #define MAX(a, b) ((a) > (b) ? (a) : (b))

-
  // backend buffer type

  const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -159,6 +158,13 @@ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml

  // backend

+ ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
+ if (backend == NULL) {
+ return NULL;
+ }
+ return backend->guid;
+ }
+
  const char * ggml_backend_name(ggml_backend_t backend) {
  if (backend == NULL) {
  return "NULL";
@@ -256,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
  backend->iface.graph_plan_free(backend, plan);
  }

- void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
- backend->iface.graph_plan_compute(backend, plan);
+ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ return backend->iface.graph_plan_compute(backend, plan);
  }

- bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  return backend->iface.graph_compute(backend, cgraph);
  }

@@ -726,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
  GGML_UNUSED(backend);
  }

- GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

- ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+ return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

  GGML_UNUSED(backend);
  }

- GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

  struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -749,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
  cplan.abort_callback = cpu_ctx->abort_callback;
  cplan.abort_callback_data = cpu_ctx->abort_callback_data;

- ggml_graph_compute(cgraph, &cplan);
- return true;
+ return ggml_graph_compute(cgraph, &cplan);
  }

  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -781,6 +786,11 @@ static struct ggml_backend_i cpu_backend_i = {
  /* .supports_op = */ ggml_backend_cpu_supports_op,
  };

+ static ggml_guid_t ggml_backend_cpu_guid(void) {
+ static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
+ return &guid;
+ }
+
  ggml_backend_t ggml_backend_cpu_init(void) {
  struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
  if (ctx == NULL) {
@@ -800,6 +810,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
  }

  *cpu_backend = (struct ggml_backend) {
+ /* .guid = */ ggml_backend_cpu_guid(),
  /* .interface = */ cpu_backend_i,
  /* .context = */ ctx
  };
@@ -807,7 +818,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
  }

  GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
- return backend && backend->iface.get_name == ggml_backend_cpu_name;
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
  }

  void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
@@ -1425,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
  return true;
  }

- static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
+ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
  uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
  uint64_t compute_us[GGML_MAX_BACKENDS] = {0};

@@ -1460,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {

  uint64_t compute_start_us = ggml_time_us();
  if (!sched->callback_eval) {
- if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
- return false;
+ enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
+ if (ec != GGML_STATUS_SUCCESS) {
+ return ec;
  }
  //ggml_backend_synchronize(split_backend); // necessary to measure compute time
  } else {
@@ -1482,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {

  struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

- if (!ggml_backend_graph_compute(split_backend, &gv)) {
- return false;
+ enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
+ if (ec != GGML_STATUS_SUCCESS) {
+ return ec;
  }

  if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
@@ -1507,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
  }
  #endif

- return true;
+ return GGML_STATUS_SUCCESS;
  }

  ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1569,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
  return true;
  }

- bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);

  if (!sched->is_reset) {
@@ -1578,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg

  ggml_backend_sched_split_graph(sched, graph);
  if (!ggml_backend_sched_alloc_splits(sched)) {
- return false;
+ return GGML_STATUS_ALLOC_FAILED;
  }

- if (!ggml_backend_sched_compute_splits(sched)) {
- return false;
- }
-
- return true;
+ return ggml_backend_sched_compute_splits(sched);
  }

  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
ggml-backend.h CHANGED
@@ -49,7 +49,7 @@ extern "C" {
  // Backend
  //

-
+ GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
  GGML_API const char * ggml_backend_name(ggml_backend_t backend);
  GGML_API void ggml_backend_free(ggml_backend_t backend);

@@ -66,12 +66,13 @@ extern "C" {

  GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

- GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+ GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);

- GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
- GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
+ GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);

  // tensor copy between different backends
  GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
@@ -157,26 +158,26 @@ extern "C" {
  typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

  // Initialize a backend scheduler
- GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
- GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
+ GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
  // Initialize backend buffers from a measure graph
- GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+ GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
  // Get the number of splits of the last graph
- GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+ GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);

- GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

- GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
- GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+ GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+ GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

  // Allocate and compute graph on the backend scheduler
- GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+ GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

  // Reset all assignments and allocators - must be called before changing the node backends
- GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
+ GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

  // Set a callback to be called for each resulting node during graph compute
- GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+ GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

  //
  // Utils