llama_cpp 0.13.0 → 0.14.1

@@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
     if (!size) {
         return;
     }
 
-    tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
+    buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
     if (!size) {
         return;
     }
 
-    tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
+    buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_synchronize(ggml_backend_t backend) {
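Both accessors now dispatch through buf, which is resolved from view_src when the tensor is a view, instead of the tensor's own buffer field. A minimal caller-side sketch (illustrative code, not part of the diff) of writing through a view, assuming a backend initialized elsewhere:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void example_set_through_view(ggml_backend_t backend) {
    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true, // tensor data is allocated by the backend, not the context
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * row0 = ggml_view_1d(ctx, a, 4, 0);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    const float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    // the copy is dispatched through the buffer that owns the viewed data (a's storage)
    ggml_backend_tensor_set(row0, data, 0, sizeof(data));

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
}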
@@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 }
 
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
     return backend->iface.graph_plan_create(backend, cgraph);
 }
 
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
     backend->iface.graph_plan_free(backend, plan);
 }
 
-void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    backend->iface.graph_plan_compute(backend, plan);
+enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
+    return backend->iface.graph_plan_compute(backend, plan);
+}
+
+enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    ggml_backend_synchronize(backend);
+    return err;
 }
 
-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }
 
@@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }
 
-void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
     if (src == dst) {
         return;
     }
 
-    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
-        if (backend->iface.cpy_tensor_async != NULL) {
-            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
-                return;
-            }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
     }
 
-    size_t nbytes = ggml_nbytes(src);
+    // an async copy would normally happen after all the queued operations on both backends are completed
+    // sync src, set_async dst
     if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
-    }
-    else {
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+    } else {
+        ggml_backend_synchronize(backend_src);
         ggml_backend_tensor_copy(src, dst);
+        ggml_backend_synchronize(backend_dst);
     }
 }
 
+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+    if (backend->iface.event_new == NULL) {
+        return NULL;
+    }
+    return backend->iface.event_new(backend);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    if (event == NULL) {
+        return;
+    }
+    event->backend->iface.event_free(event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_record != NULL);
+
+    event->backend->iface.event_record(event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+
+    event->backend->iface.event_synchronize(event);
+}
+
+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend->iface.event_wait != NULL);
+
+    backend->iface.event_wait(backend, event);
+}
 
 // backend registry
 
-#define GGML_MAX_BACKENDS_REG 16
+#define GGML_REG_MAX_BACKENDS 16
 
 struct ggml_backend_reg {
     char name[128];
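The new event API above gives callers a finer-grained alternative to ggml_backend_synchronize. A hedged sketch (caller code, not from the diff; assumes the backend implements events, e.g. a GPU backend) mirroring the scheduler's pattern of queuing an input upload, recording an event, and synchronizing only that point later:

// backends without event support return NULL from ggml_backend_event_new,
// so the plain synchronize path is kept as a fallback
static void copy_then_reuse_input(ggml_backend_t backend, struct ggml_tensor * dst,
                                  const float * host_data, size_t nbytes) {
    ggml_backend_event_t ev = ggml_backend_event_new(backend);

    ggml_backend_tensor_set_async(backend, dst, host_data, 0, nbytes); // queue the upload
    if (ev != NULL) {
        ggml_backend_event_record(ev);      // mark the point after the queued upload
        // ... queue more work on the backend here ...
        ggml_backend_event_synchronize(ev); // host waits only for the upload, not for everything
    } else {
        ggml_backend_synchronize(backend);  // fallback: full backend synchronization
    }
    // host_data can now be safely reused or overwritten

    ggml_backend_event_free(ev);
}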
@@ -350,7 +396,7 @@ struct ggml_backend_reg {
     void * user_data;
 };
 
-static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
+static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
 static size_t ggml_backend_registry_count = 0;
 
 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
+    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
 
     size_t id = ggml_backend_registry_count;
 
@@ -732,22 +778,26 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
 
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
 
     if (cpu_ctx->work_size < cplan.work_size) {
-        // TODO: may be faster to free and use malloc to avoid the copy
-        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
+        free(cpu_ctx->work_data);
+        cpu_ctx->work_data = malloc(cplan.work_size);
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
         cpu_ctx->work_size = cplan.work_size;
     }
     cplan.work_data = cpu_ctx->work_data;
@@ -755,8 +805,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
     cplan.abort_callback = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;
 
-    ggml_graph_compute(cgraph, &cplan);
-    return true;
+    return ggml_graph_compute(cgraph, &cplan);
 }
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
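With the two hunks above, the CPU backend propagates enum ggml_status from ggml_graph_compute instead of collapsing every outcome into a bool, and a failed work-buffer allocation is reported as GGML_STATUS_ALLOC_FAILED rather than silently continuing. A minimal caller-side sketch (illustrative code, not part of the diff; run_graph is a hypothetical helper) of checking the status:

#include <stdio.h>
#include "ggml.h"
#include "ggml-backend.h"

// assumes `backend` is initialized and `graph` was built and allocated on it
static int run_graph(ggml_backend_t backend, struct ggml_cgraph * graph) {
    enum ggml_status status = ggml_backend_graph_compute(backend, graph);
    switch (status) {
        case GGML_STATUS_SUCCESS:      return 0;
        case GGML_STATUS_ABORTED:      fprintf(stderr, "compute aborted by callback\n");    return 1;
        case GGML_STATUS_ALLOC_FAILED: fprintf(stderr, "work buffer allocation failed\n");  return 1;
        default:                       fprintf(stderr, "compute failed\n");                 return 1;
    }
}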
@@ -785,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -940,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {
 
 // scheduler
 
-#define GGML_MAX_BACKENDS 16
-#define GGML_MAX_SPLITS 256
-#define GGML_MAX_SPLIT_INPUTS 16
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 256
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif
 
 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     // graph view of this split
     struct ggml_cgraph graph;
@@ -956,45 +1022,53 @@ struct ggml_backend_sched_split {
 
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
+    bool is_alloc;
 
     int n_backends;
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
 
+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;
 
     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
     // hash values
     int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
 
-    int * node_backend_ids; // [n_nodes]
-    int n_nodes;
+    int * node_backend_ids; // [graph_size]
+    int * leaf_backend_ids; // [graph_size]
 
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
 
-    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+    // graph splits
+    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
     int n_splits;
 
+    // pipeline parallelism support
+    int n_copies;
+    int cur_copy;
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_graph_inputs;
+
     struct ggml_context * ctx;
 
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
     // align context_buffer to GGML_MEM_ALIGN
-    #ifdef _MSC_VER
+#ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
-    #else
+#else
     __attribute__((aligned(GGML_MEM_ALIGN)))
-    #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+#endif
+    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };
 
-#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
-#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
+#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
 
 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1006,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }
 
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1017,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
             return i;
         }
     }
-    GGML_ASSERT(false && "tensor buffer type not supported by any backend");
-    return -1; // silence warning
+
+    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
+        __func__, ggml_backend_buffer_name(buffer), tensor->name);
+    GGML_ASSERT(false);
+
+    return -1;
 }
 
 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1036,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
     // assign pre-allocated nodes to their backend
     // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
     if (cur_backend != -1) {
-        SET_CAUSE(node, "1.dst");
+        SET_CAUSE(tensor, "1.dst");
         return cur_backend;
     }
+
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
         if (cur_backend != -1) {
-            SET_CAUSE(node, "1.vsrc");
+            SET_CAUSE(tensor, "1.vsrc");
             return cur_backend;
         }
     }
+
+    // input
+    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
+        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        SET_CAUSE(tensor, "1.inp");
+        return cur_backend;
+    }
+
     // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
@@ -1056,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
+            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
             // operations with weights are always run on the same backend as the weights
-            SET_CAUSE(node, "1.wgt%d", i);
+            SET_CAUSE(tensor, "1.wgt%d", i);
             return src_backend;
         }
     }
@@ -1094,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-        ggml_backend_t tensor_backend = tensor_backend(node);
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1102,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend = tensor_backend(src);
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
@@ -1119,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
+    sched->n_graph_inputs = 0;
     sched->is_reset = false;
 
     struct ggml_init_params params = {
@@ -1164,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
     // pass 2: expand current backend assignments
@@ -1172,10 +1261,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
 
-    // pass 2.1 expand gpu up
+
+    // pass 2.2 expand gpu down
     {
         int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1190,15 +1280,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+                SET_CAUSE(node, "2.2");
             }
         }
     }
 
-    // pass 2.2 expand gpu down
+    // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1213,15 +1303,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+                SET_CAUSE(node, "2.1");
             }
         }
     }
 
-    // pass 2.3 expand rest up
+
+    // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1231,15 +1322,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.3");
+                SET_CAUSE(node, "2.4");
             }
         }
     }
-
-    // pass 2.4 expand rest down
+    // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1249,12 +1339,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+                SET_CAUSE(node, "2.3");
             }
         }
     }
+
 #ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
     // pass 3: assign backends to remaining src from dst and view_src
@@ -1284,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
     // pass 4: split graph, find tensors that need to be copied
@@ -1316,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (tensor_backend_id != cur_backend_id) {
                 sched->splits[cur_split].i_end = i;
                 cur_split++;
-                GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
                 sched->splits[cur_split].backend_id = tensor_backend_id;
                 sched->splits[cur_split].i_start = i;
                 sched->splits[cur_split].n_inputs = 0;
@@ -1329,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (src == NULL) {
                     continue;
                 }
+
                 int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now
+
+                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                    size_t id = hash_id(src);
+                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                        ggml_backend_t backend = sched->backends[src_backend_id];
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy;
+                            if (c == sched->cur_copy) {
+                                tensor_copy = src; // use the original tensor as the current copy
+                            } else {
+                                tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                                ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            }
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                            tensor_backend_id(tensor_copy) = src_backend_id;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
+                        int n_graph_inputs = sched->n_graph_inputs++;
+                        GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                        sched->graph_inputs[n_graph_inputs] = src;
+                    }
+                }
+
                 if (src_backend_id != tensor_backend_id) {
                     // create a copy of the input in the split's backend
                     size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->tensor_copies[id][cur_backend_id] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                            ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                            tensor_backend_id(tensor_copy) = cur_backend_id;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
                         int n_inputs = sched->splits[cur_split].n_inputs++;
-                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         sched->splits[cur_split].inputs[n_inputs] = src;
                     }
-                    node->src[j] = sched->tensor_copies[id][cur_backend_id];
+                    node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
                 }
             }
         }
@@ -1355,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->n_splits = cur_split + 1;
     }
 #ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = tensor_backend(node);
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         if (tensor_backend == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
-        if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
+        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
                 node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
+                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
+                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend = tensor_backend(src);
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                     node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                     j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
             }
-            if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
+            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
                     src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
+                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
+                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
             }
         }
     }
@@ -1393,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #endif
 
     // create copies of the graph for each split
-    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
+    // TODO: avoid this copy
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
 
             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            input_dep->src[0] = input;
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
 
@@ -1418,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
     }
+
+    if (sched->n_copies > 1) {
+        // add input copies as leafs so that they are allocated first
+        for (int i = 0; i < sched->n_graph_inputs; i++) {
+            struct ggml_tensor * input = sched->graph_inputs[i];
+            size_t id = hash_id(input);
+            int backend_id = tensor_backend_id(input);
+            for (int c = 0; c < sched->n_copies; c++) {
+                struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+            }
+        }
+
+        for (int i = 0; i < sched->n_splits; i++) {
+            struct ggml_backend_sched_split * split = &sched->splits[i];
+            int backend_id = split->backend_id;
+            for (int j = 0; j < split->n_inputs; j++) {
+                struct ggml_tensor * input = split->inputs[j];
+                size_t id = hash_id(input);
+                for (int c = 0; c < sched->n_copies; c++) {
+                    struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                    sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                    graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+                }
+            }
+        }
+    }
+
+    // add leafs from the original graph
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
+    }
+
     sched->graph = graph_copy;
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+    // allocate graph
     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        // the re-allocation may cause the split inputs to be moved to a different address
+        ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
+        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
-            fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
+            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -1437,10 +1602,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     return true;
 }
 
-static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
-    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
-    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     struct ggml_backend_sched_split * splits = sched->splits;
 
     for (int i = 0; i < sched->n_splits; i++) {
@@ -1449,33 +1611,36 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
         // copy the input tensors to the split backend
-        uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
 
-            GGML_ASSERT(input->buffer != NULL);
-            GGML_ASSERT(input_cpy->buffer != NULL);
+            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                ggml_backend_tensor_copy(input, input_cpy);
+            } else {
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                    ggml_backend_synchronize(input_backend);
+                }
 
-            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
+                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+            }
         }
-        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
-        int64_t copy_end_us = ggml_time_us();
-        copy_us[split_backend_id] += copy_end_us - copy_start_us;
-
-#if 0
-        char split_filename[GGML_MAX_NAME];
-        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
-        ggml_graph_dump_dot(split->graph, NULL, split_filename);
-#endif
 
-
-        uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
-            if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
-                return false;
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return ec;
             }
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
@@ -1494,10 +1659,14 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 
                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
-                if (!ggml_backend_graph_compute(split_backend, &gv)) {
-                    return false;
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+                if (ec != GGML_STATUS_SUCCESS) {
+                    return ec;
                 }
 
+                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                ggml_backend_synchronize(split_backend);
+
                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                     break;
                 }
@@ -1505,39 +1674,54 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
                 j0 = j1;
             }
         }
-        uint64_t compute_end_us = ggml_time_us();
-        compute_us[split_backend_id] += compute_end_us - compute_start_us;
-    }
 
-#if 0
-    // per-backend timings
-    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (copy_us[i] > 0 || compute_us[i] > 0) {
-            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+        // record the event of this copy
+        if (split->n_inputs > 0) {
+            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+            }
         }
     }
-#endif
 
-    return true;
+    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
+
+    return GGML_STATUS_SUCCESS;
 }
 
-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
+ggml_backend_sched_t ggml_backend_sched_new(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel) {
     GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
+    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
 
     sched->n_backends = n_backends;
-    for (int i = 0; i < n_backends; i++) {
-        sched->backends[i] = backends[i];
-        sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
+
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+
+    for (int b = 0; b < n_backends; b++) {
+        sched->backends[b] = backends[b];
+        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        if (sched->n_copies > 1) {
+            for (int c = 0; c < sched->n_copies; c++) {
+                sched->events[b][c] = ggml_backend_event_new(backends[b]);
+            }
+        }
     }
 
     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
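A hedged sketch of constructing a scheduler with the widened signature (illustrative caller code, not part of the diff; the CUDA backend and the chosen graph size are assumptions):

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cuda.h" // assumption: built with the CUDA backend

static ggml_backend_sched_t make_sched(void) {
    ggml_backend_t backends[2];
    backends[0] = ggml_backend_cuda_init(0); // higher-priority backend first
    backends[1] = ggml_backend_cpu_init();   // last backend must be CPU (asserted above)

    // NULL bufts -> default buffer type of each backend;
    // parallel=true enables up to GGML_SCHED_MAX_COPIES in-flight copies of the graph inputs
    return ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE, true);
}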
@@ -1551,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
         return;
     }
+    for (int b = 0; b < sched->n_backends; b++) {
+        for (int c = 0; c < sched->n_copies; c++) {
+            ggml_backend_event_free(sched->events[b][c]);
+        }
+    }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
+    free(sched->leaf_backend_ids);
     free(sched);
 }
 
@@ -1568,38 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
     sched->is_reset = true;
+    sched->is_alloc = false;
 }
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     ggml_backend_sched_split_graph(sched, measure_graph);
 
-    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
+    // TODO: extract this to a separate function
+    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
 
     ggml_backend_sched_reset(sched);
+    ggml_backend_sched_synchronize(sched);
+
     return true;
 }
 
-bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
-
-    if (!sched->is_reset) {
-        ggml_backend_sched_reset(sched);
-    }
+bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
 
     ggml_backend_sched_split_graph(sched, graph);
+
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
 
-    if (!ggml_backend_sched_compute_splits(sched)) {
-        return false;
-    }
+    sched->is_alloc = true;
 
     return true;
 }
 
+enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+    ggml_backend_sched_synchronize(sched);
+    return err;
+}
+
+enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    if (!sched->is_reset && !sched->is_alloc) {
+        ggml_backend_sched_reset(sched);
+    }
+
+    if (!sched->is_alloc) {
+        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+    }
+
+    return ggml_backend_sched_compute_splits(sched);
+}
+
+void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_backend_synchronize(sched->backends[i]);
+    }
+}
+
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
     sched->callback_eval = callback;
     sched->callback_eval_user_data = user_data;
@@ -1609,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
 
+int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    return sched->n_copies;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
-void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
 }
 
-ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
     int backend_index = tensor_backend_id(node);
     if (backend_index == -1) {
         return NULL;
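Putting the new scheduler entry points together, a hedged end-to-end sketch (caller code, not part of the diff; sched is assumed to come from ggml_backend_sched_new(..., /*parallel=*/true) and graph from the caller's own graph-building code):

static enum ggml_status run_scheduled(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    ggml_backend_sched_reset(sched);                      // clear previous assignments
    if (!ggml_backend_sched_alloc_graph(sched, graph)) {  // split + allocate explicitly
        return GGML_STATUS_ALLOC_FAILED;
    }

    // queue the splits; with parallel copies enabled, input uploads for the next
    // graph can overlap with compute that is still in flight
    enum ggml_status status = ggml_backend_sched_graph_compute_async(sched, graph);

    ggml_backend_sched_synchronize(sched);                // wait for all backends to finish
    return status;
}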