llama_cpp 0.14.0 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
     if (!size) {
         return;
     }
 
-    tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
+    buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
     if (!size) {
         return;
     }
 
-    tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
+    buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_synchronize(ggml_backend_t backend) {
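Both functions above now resolve the buffer through `view_src` before asserting and dispatch through the resolved `buf` instead of `tensor->buffer`, so writing to or reading from a view goes to the buffer that actually owns the data. A minimal round-trip through these entry points, as a hedged sketch rather than code from the package (the CPU backend and the tensor name `t` are illustrative assumptions):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Round-trip a small tensor through ggml_backend_tensor_set/get (sketch).
static void roundtrip_example(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,                 // tensor data lives in the backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);

    ggml_backend_t        backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buf     = ggml_backend_alloc_ctx_tensors(ctx, backend);

    float src[16] = {0}, dst[16];
    ggml_backend_tensor_set(t, src, 0, sizeof(src)); // host -> backend buffer
    ggml_backend_tensor_get(t, dst, 0, sizeof(dst)); // backend buffer -> host

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
}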
@@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 }
 
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
     return backend->iface.graph_plan_create(backend, cgraph);
 }
 
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
     backend->iface.graph_plan_free(backend, plan);
 }
 
 enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
     return backend->iface.graph_plan_compute(backend, plan);
 }
 
 enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    ggml_backend_synchronize(backend);
+    return err;
+}
+
+bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }
 
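`ggml_backend_graph_compute` is now a thin blocking wrapper: it submits through the new `ggml_backend_graph_compute_async` and then waits with `ggml_backend_synchronize`. A hedged sketch of the two equivalent call patterns from user code (helper names are illustrative and graph construction is elided):

#include "ggml.h"
#include "ggml-backend.h"

// Blocking call, same behaviour as before.
static enum ggml_status compute_blocking(ggml_backend_t backend, struct ggml_cgraph * graph) {
    return ggml_backend_graph_compute(backend, graph);        // submits and waits
}

// Explicit submit + wait, matching what the wrapper now does internally.
static enum ggml_status compute_overlapped(ggml_backend_t backend, struct ggml_cgraph * graph) {
    enum ggml_status st = ggml_backend_graph_compute_async(backend, graph); // submit only
    // ... other host-side work can overlap with the backend here ...
    ggml_backend_synchronize(backend);                         // wait for completion
    return st;
}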
@@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }
 
-void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
     if (src == dst) {
         return;
     }
 
-    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
-        if (backend->iface.cpy_tensor_async != NULL) {
-            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
-                return;
-            }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
     }
 
-    size_t nbytes = ggml_nbytes(src);
+    // an async copy would normally happen after all the queued operations on both backends are completed
+    // sync src, set_async dst
     if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
-    }
-    else {
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+    } else {
+        ggml_backend_synchronize(backend_src);
         ggml_backend_tensor_copy(src, dst);
+        ggml_backend_synchronize(backend_dst);
+    }
+}
+
+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+    if (backend->iface.event_new == NULL) {
+        return NULL;
+    }
+    return backend->iface.event_new(backend);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    if (event == NULL) {
+        return;
     }
+    event->backend->iface.event_free(event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_record != NULL);
+
+    event->backend->iface.event_record(event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+
+    event->backend->iface.event_synchronize(event);
 }
 
+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend->iface.event_wait != NULL);
+
+    backend->iface.event_wait(backend, event);
+}
 
 // backend registry
 
-#define GGML_MAX_BACKENDS_REG 16
+#define GGML_REG_MAX_BACKENDS 16
 
 struct ggml_backend_reg {
     char name[128];
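The new event API added above follows a record/wait pattern: create an event on the backend that produces the work, record it after submitting, then either block the host with `ggml_backend_event_synchronize` or let another backend's queue wait with `ggml_backend_event_wait`. A hedged sketch of that pattern (the two backends, the graph, and event support are assumptions; backends without event support return NULL from `ggml_backend_event_new`):

#include "ggml.h"
#include "ggml-backend.h"

// Make `consumer` wait for work queued on `producer` without blocking the host (sketch).
static void event_wait_example(ggml_backend_t producer, ggml_backend_t consumer,
                               struct ggml_cgraph * producer_graph) {
    ggml_backend_event_t ev = ggml_backend_event_new(producer); // NULL if unsupported
    ggml_backend_graph_compute_async(producer, producer_graph);
    if (ev != NULL) {
        ggml_backend_event_record(ev);
        ggml_backend_event_wait(consumer, ev);   // consumer's queue waits on the event
    } else {
        ggml_backend_synchronize(producer);      // fallback: block the host instead
    }
    // ... queue work on `consumer` that depends on producer_graph ...
    ggml_backend_event_free(ev);                 // freeing a NULL event is a no-op
}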
@@ -350,7 +396,7 @@ struct ggml_backend_reg {
     void * user_data;
 };
 
-static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
+static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
 static size_t ggml_backend_registry_count = 0;
 
 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
+    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
 
     size_t id = ggml_backend_registry_count;
 
@@ -746,8 +792,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
 
     if (cpu_ctx->work_size < cplan.work_size) {
-        // TODO: may be faster to free and use malloc to avoid the copy
-        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
+        free(cpu_ctx->work_data);
+        cpu_ctx->work_data = malloc(cplan.work_size);
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
         cpu_ctx->work_size = cplan.work_size;
     }
     cplan.work_data = cpu_ctx->work_data;
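The work buffer's old contents are never reused, so the change above replaces `realloc` (which may copy the old block) with `free` plus `malloc`, and reports allocation failure instead of continuing with a NULL pointer. The same grow-without-preserving idiom in isolation, as a generic sketch rather than code from the package:

#include <stdlib.h>

// Grow a scratch buffer whose old contents are disposable (sketch).
static int grow_scratch(void ** data, size_t * size, size_t needed) {
    if (*size >= needed) {
        return 0;
    }
    free(*data);                // old contents are not needed: skip realloc's copy
    *data = malloc(needed);
    if (*data == NULL) {
        *size = 0;              // keep the bookkeeping consistent on failure
        return -1;
    }
    *size = needed;
    return 0;
}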
@@ -784,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute      = */ ggml_backend_cpu_graph_compute,
     /* .supports_op        = */ ggml_backend_cpu_supports_op,
+    /* .event_new          = */ NULL,
+    /* .event_free         = */ NULL,
+    /* .event_record       = */ NULL,
+    /* .event_wait         = */ NULL,
+    /* .event_synchronize  = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -939,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {
 
 // scheduler
 
-#define GGML_MAX_BACKENDS 16
-#define GGML_MAX_SPLITS 256
-#define GGML_MAX_SPLIT_INPUTS 16
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 256
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif
 
 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     // graph view of this split
     struct ggml_cgraph graph;
@@ -955,45 +1022,53 @@ struct ggml_backend_sched_split {
 
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
+    bool is_alloc;
 
     int n_backends;
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
 
+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;
 
     // hash keys of the nodes in the graph
    struct ggml_hash_set hash_set;
     // hash values
     int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
 
-    int * node_backend_ids; // [n_nodes]
-    int n_nodes;
+    int * node_backend_ids; // [graph_size]
+    int * leaf_backend_ids; // [graph_size]
 
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
 
-    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+    // graph splits
+    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
     int n_splits;
 
+    // pipeline parallelism support
+    int n_copies;
+    int cur_copy;
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_graph_inputs;
+
     struct ggml_context * ctx;
 
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
     // align context_buffer to GGML_MEM_ALIGN
-    #ifdef _MSC_VER
+#ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
-    #else
+#else
     __attribute__((aligned(GGML_MEM_ALIGN)))
-    #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+#endif
+    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };
 
-#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
-#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
+#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
 
 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1005,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }
 
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1016,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
             return i;
         }
     }
-    GGML_ASSERT(false && "tensor buffer type not supported by any backend");
-    return -1; // silence warning
+
+    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
+            __func__, ggml_backend_buffer_name(buffer), tensor->name);
+    GGML_ASSERT(false);
+
+    return -1;
 }
 
 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1035,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
     // assign pre-allocated nodes to their backend
     // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
     if (cur_backend != -1) {
-        SET_CAUSE(node, "1.dst");
+        SET_CAUSE(tensor, "1.dst");
         return cur_backend;
     }
+
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
         if (cur_backend != -1) {
-            SET_CAUSE(node, "1.vsrc");
+            SET_CAUSE(tensor, "1.vsrc");
            return cur_backend;
         }
     }
+
+    // input
+    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
+        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        SET_CAUSE(tensor, "1.inp");
+        return cur_backend;
+    }
+
     // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
@@ -1055,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
+            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
             // operations with weights are always run on the same backend as the weights
-            SET_CAUSE(node, "1.wgt%d", i);
+            SET_CAUSE(tensor, "1.wgt%d", i);
             return src_backend;
         }
     }
@@ -1093,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-        ggml_backend_t tensor_backend = tensor_backend(node);
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1101,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend = tensor_backend(src);
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
@@ -1118,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
+    sched->n_graph_inputs = 0;
     sched->is_reset = false;
 
     struct ggml_init_params params = {
@@ -1163,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
     // pass 2: expand current backend assignments
@@ -1171,10 +1261,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
 
-    // pass 2.1 expand gpu up
+
+    // pass 2.2 expand gpu down
     {
         int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1189,15 +1280,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+                SET_CAUSE(node, "2.2");
             }
         }
     }
 
-    // pass 2.2 expand gpu down
+    // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1212,15 +1303,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+                SET_CAUSE(node, "2.1");
             }
         }
     }
 
-    // pass 2.3 expand rest up
+
+    // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1230,15 +1322,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.3");
+                SET_CAUSE(node, "2.4");
             }
         }
     }
-
-    // pass 2.4 expand rest down
+    // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1248,12 +1339,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+                SET_CAUSE(node, "2.3");
             }
         }
     }
+
 #ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
     // pass 3: assign backends to remaining src from dst and view_src
@@ -1283,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
     // pass 4: split graph, find tensors that need to be copied
@@ -1315,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (tensor_backend_id != cur_backend_id) {
                 sched->splits[cur_split].i_end = i;
                 cur_split++;
-                GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
                 sched->splits[cur_split].backend_id = tensor_backend_id;
                 sched->splits[cur_split].i_start = i;
                 sched->splits[cur_split].n_inputs = 0;
@@ -1328,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (src == NULL) {
                     continue;
                 }
+
                 int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now
+
+                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                    size_t id = hash_id(src);
+                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                        ggml_backend_t backend = sched->backends[src_backend_id];
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy;
+                            if (c == sched->cur_copy) {
+                                tensor_copy = src; // use the original tensor as the current copy
+                            } else {
+                                tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                                ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            }
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                            tensor_backend_id(tensor_copy) = src_backend_id;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
+                        int n_graph_inputs = sched->n_graph_inputs++;
+                        GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                        sched->graph_inputs[n_graph_inputs] = src;
+                    }
+                }
+
                 if (src_backend_id != tensor_backend_id) {
                     // create a copy of the input in the split's backend
                     size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->tensor_copies[id][cur_backend_id] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                            ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                            tensor_backend_id(tensor_copy) = cur_backend_id;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
                         int n_inputs = sched->splits[cur_split].n_inputs++;
-                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         sched->splits[cur_split].inputs[n_inputs] = src;
                     }
-                    node->src[j] = sched->tensor_copies[id][cur_backend_id];
+                    node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
                 }
             }
         }
@@ -1354,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->n_splits = cur_split + 1;
     }
 #ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = tensor_backend(node);
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         if (tensor_backend == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
-        if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
+        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
                 node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
+                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
+                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend = tensor_backend(src);
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                     node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                     j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
             }
-            if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
+            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
                     src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
+                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
+                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
             }
         }
     }
@@ -1392,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #endif
 
     // create copies of the graph for each split
-    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
+    // TODO: avoid this copy
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
 
             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            input_dep->src[0] = input;
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
 
@@ -1417,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
     }
+
+    if (sched->n_copies > 1) {
+        // add input copies as leafs so that they are allocated first
+        for (int i = 0; i < sched->n_graph_inputs; i++) {
+            struct ggml_tensor * input = sched->graph_inputs[i];
+            size_t id = hash_id(input);
+            int backend_id = tensor_backend_id(input);
+            for (int c = 0; c < sched->n_copies; c++) {
+                struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+            }
+        }
+
+        for (int i = 0; i < sched->n_splits; i++) {
+            struct ggml_backend_sched_split * split = &sched->splits[i];
+            int backend_id = split->backend_id;
+            for (int j = 0; j < split->n_inputs; j++) {
+                struct ggml_tensor * input = split->inputs[j];
+                size_t id = hash_id(input);
+                for (int c = 0; c < sched->n_copies; c++) {
+                    struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                    sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                    graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+                }
+            }
+        }
+    }
+
+    // add leafs from the original graph
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
+    }
+
     sched->graph = graph_copy;
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+    // allocate graph
     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        // the re-allocation may cause the split inputs to be moved to a different address
+        ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
+        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
-            fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
+            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -1437,9 +1603,6 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 }
 
 static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
-    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
-    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
     struct ggml_backend_sched_split * splits = sched->splits;
 
     for (int i = 0; i < sched->n_splits; i++) {
@@ -1448,34 +1611,36 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
         // copy the input tensors to the split backend
-        uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
 
-            GGML_ASSERT(input->buffer != NULL);
-            GGML_ASSERT(input_cpy->buffer != NULL);
+            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                ggml_backend_tensor_copy(input, input_cpy);
+            } else {
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                    ggml_backend_synchronize(input_backend);
+                }
 
-            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
+                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+            }
         }
-        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
-        int64_t copy_end_us = ggml_time_us();
-        copy_us[split_backend_id] += copy_end_us - copy_start_us;
 
-#if 0
-        char split_filename[GGML_MAX_NAME];
-        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
-        ggml_graph_dump_dot(split->graph, NULL, split_filename);
-#endif
-
-
-        uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
-            enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
             }
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
@@ -1494,11 +1659,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 
                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
-                enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
                 if (ec != GGML_STATUS_SUCCESS) {
                     return ec;
                 }
 
+                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                ggml_backend_synchronize(split_backend);
+
                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                     break;
                 }
@@ -1506,39 +1674,54 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 
                 j0 = j1;
             }
-            uint64_t compute_end_us = ggml_time_us();
-            compute_us[split_backend_id] += compute_end_us - compute_start_us;
-        }
 
-#if 0
-    // per-backend timings
-    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (copy_us[i] > 0 || compute_us[i] > 0) {
-            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+        // record the event of this copy
+        if (split->n_inputs > 0) {
+            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+            }
         }
     }
-#endif
+
+    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
 
     return GGML_STATUS_SUCCESS;
 }
 
-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
+ggml_backend_sched_t ggml_backend_sched_new(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel) {
     GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
+    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
 
     sched->n_backends = n_backends;
-    for (int i = 0; i < n_backends; i++) {
-        sched->backends[i] = backends[i];
-        sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
+
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+
+    for (int b = 0; b < n_backends; b++) {
+        sched->backends[b] = backends[b];
+        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        if (sched->n_copies > 1) {
+            for (int c = 0; c < sched->n_copies; c++) {
+                sched->events[b][c] = ggml_backend_event_new(backends[b]);
+            }
+        }
     }
 
     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
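`ggml_backend_sched_new` gains a `parallel` flag and now requires the CPU backend to be last; passing `true` makes the scheduler allocate GGML_SCHED_MAX_COPIES input copies and per-backend events for pipeline parallelism. A hedged construction sketch (the GPU backend handle passed in is a placeholder assumption, not tied to a specific backend):

#include "ggml.h"
#include "ggml-backend.h"

// Build a scheduler over one GPU backend plus the mandatory trailing CPU backend (sketch).
static ggml_backend_sched_t make_sched(ggml_backend_t gpu_backend, size_t graph_size) {
    ggml_backend_t backends[2] = { gpu_backend, ggml_backend_cpu_init() };

    // bufts == NULL -> default buffer type of each backend; parallel == true enables
    // the multi-copy pipeline path guarded by sched->n_copies > 1 in the code above.
    return ggml_backend_sched_new(backends, NULL, 2, graph_size, /*parallel =*/ true);
}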
@@ -1552,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
         return;
     }
+    for (int b = 0; b < sched->n_backends; b++) {
+        for (int c = 0; c < sched->n_copies; c++) {
+            ggml_backend_event_free(sched->events[b][c]);
+        }
+    }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
+    free(sched->leaf_backend_ids);
     free(sched);
 }
 
@@ -1569,34 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
     sched->is_reset = true;
+    sched->is_alloc = false;
 }
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     ggml_backend_sched_split_graph(sched, measure_graph);
 
-    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
+    // TODO: extract this to a separate function
+    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
 
     ggml_backend_sched_reset(sched);
+    ggml_backend_sched_synchronize(sched);
+
+    return true;
+}
+
+bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+
+    ggml_backend_sched_split_graph(sched, graph);
+
+    if (!ggml_backend_sched_alloc_splits(sched)) {
+        return false;
+    }
+
+    sched->is_alloc = true;
+
     return true;
 }
 
 enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+    ggml_backend_sched_synchronize(sched);
+    return err;
+}
 
-    if (!sched->is_reset) {
+enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    if (!sched->is_reset && !sched->is_alloc) {
         ggml_backend_sched_reset(sched);
     }
 
-    ggml_backend_sched_split_graph(sched, graph);
-    if (!ggml_backend_sched_alloc_splits(sched)) {
-        return GGML_STATUS_ALLOC_FAILED;
+    if (!sched->is_alloc) {
+        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+            return GGML_STATUS_ALLOC_FAILED;
+        }
     }
 
     return ggml_backend_sched_compute_splits(sched);
 }
 
+void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_backend_synchronize(sched->backends[i]);
+    }
+}
+
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
     sched->callback_eval = callback;
     sched->callback_eval_user_data = user_data;
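With the split into `ggml_backend_sched_alloc_graph`, `ggml_backend_sched_graph_compute_async`, and `ggml_backend_sched_synchronize`, a caller can submit a graph and defer the wait. A hedged usage sketch (graph construction and scheduler setup are elided):

#include "ggml.h"
#include "ggml-backend.h"

// Submit a graph asynchronously through the scheduler and wait later (sketch).
static enum ggml_status run_graph_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    if (!ggml_backend_sched_alloc_graph(sched, graph)) {    // split + allocate explicitly
        return GGML_STATUS_ALLOC_FAILED;
    }
    enum ggml_status st = ggml_backend_sched_graph_compute_async(sched, graph);
    // ... prepare the next batch on the host while the backends run ...
    ggml_backend_sched_synchronize(sched);                  // wait on all backends
    return st;
}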
@@ -1606,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
 
+int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    return sched->n_copies;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
-void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
 }
 
-ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
     int backend_index = tensor_backend_id(node);
     if (backend_index == -1) {
         return NULL;