llama_cpp 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
  GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

  if (!size) {
  return;
  }

- tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
+ buf->iface.set_tensor(buf, tensor, data, offset, size);
  }

  GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

  if (!size) {
  return;
  }

- tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
+ buf->iface.get_tensor(buf, tensor, data, offset, size);
  }

  void ggml_backend_synchronize(ggml_backend_t backend) {
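For orientation, a minimal usage sketch of the accessors changed above (not part of the diff; the tensor handle `t` is a hypothetical tensor already allocated in a backend buffer):

// hedged sketch: round-trip one float through the backend buffer interface
float value = 42.0f;
ggml_backend_tensor_set(t, &value, 0, sizeof(value));        // host memory -> backend buffer
float readback = 0.0f;
ggml_backend_tensor_get(t, &readback, 0, sizeof(readback));  // backend buffer -> host memory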
@@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
  }

  ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
  return backend->iface.graph_plan_create(backend, cgraph);
  }

  void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
  backend->iface.graph_plan_free(backend, plan);
  }

- void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
- backend->iface.graph_plan_compute(backend, plan);
+ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
+ return backend->iface.graph_plan_compute(backend, plan);
+ }
+
+ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+ ggml_backend_synchronize(backend);
+ return err;
  }

- bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  return backend->iface.graph_compute(backend, cgraph);
  }
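Callers of the compute API now receive an enum ggml_status instead of a bool; a hedged caller-side sketch (backend and cgraph are assumed to exist elsewhere, and GGML_STATUS_SUCCESS is the success value defined in ggml.h):

// hedged sketch: blocking compute with the new status-returning API
enum ggml_status st = ggml_backend_graph_compute(backend, cgraph);
if (st != GGML_STATUS_SUCCESS) {
    fprintf(stderr, "graph compute failed with status %d\n", (int) st);
}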
 
@@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
  }
  }

- void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

  if (src == dst) {
  return;
  }

- if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
- if (backend->iface.cpy_tensor_async != NULL) {
- if (backend->iface.cpy_tensor_async(backend, src, dst)) {
- return;
- }
+ if (backend_dst->iface.cpy_tensor_async != NULL) {
+ if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+ return;
  }
  }

- size_t nbytes = ggml_nbytes(src);
+ // an async copy would normally happen after all the queued operations on both backends are completed
+ // sync src, set_async dst
  if (ggml_backend_buffer_is_host(src->buffer)) {
- ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
- }
- else {
+ ggml_backend_synchronize(backend_src);
+ ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+ } else {
+ ggml_backend_synchronize(backend_src);
  ggml_backend_tensor_copy(src, dst);
+ ggml_backend_synchronize(backend_dst);
  }
  }

+ // events
+
+ ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+ if (backend->iface.event_new == NULL) {
+ return NULL;
+ }
+ return backend->iface.event_new(backend);
+ }
+
+ void ggml_backend_event_free(ggml_backend_event_t event) {
+ if (event == NULL) {
+ return;
+ }
+ event->backend->iface.event_free(event);
+ }
+
+ void ggml_backend_event_record(ggml_backend_event_t event) {
+ GGML_ASSERT(event->backend->iface.event_record != NULL);
+
+ event->backend->iface.event_record(event);
+ }
+
+ void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+ GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+
+ event->backend->iface.event_synchronize(event);
+ }
+
+ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+ GGML_ASSERT(backend->iface.event_wait != NULL);
+
+ backend->iface.event_wait(backend, event);
+ }
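A hedged sketch of how the new event primitives can chain work across two backends (backend_a, backend_b, graph_a, and graph_b are hypothetical; ggml_backend_event_new returns NULL when a backend does not implement events):

// hedged sketch: let backend_b wait on backend_a without blocking the host
ggml_backend_event_t ev = ggml_backend_event_new(backend_a);   // may be NULL if unsupported
ggml_backend_graph_compute_async(backend_a, graph_a);
if (ev != NULL) {
    ggml_backend_event_record(ev);              // mark the point where graph_a's work is enqueued
    ggml_backend_event_wait(backend_b, ev);     // backend_b waits for that point
} else {
    ggml_backend_synchronize(backend_a);        // fallback: blocking synchronization
}
ggml_backend_graph_compute_async(backend_b, graph_b);
ggml_backend_event_free(ev);                    // safe to call with NULL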
 
  // backend registry

- #define GGML_MAX_BACKENDS_REG 16
+ #define GGML_REG_MAX_BACKENDS 16

  struct ggml_backend_reg {
  char name[128];
@@ -350,7 +396,7 @@ struct ggml_backend_reg {
  void * user_data;
  };

- static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
+ static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
  static size_t ggml_backend_registry_count = 0;

  GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
  }

  GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
- GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
+ GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);

  size_t id = ggml_backend_registry_count;

@@ -732,22 +778,26 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
  GGML_UNUSED(backend);
  }

- GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

- ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+ return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

  GGML_UNUSED(backend);
  }

- GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

  struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);

  if (cpu_ctx->work_size < cplan.work_size) {
- // TODO: may be faster to free and use malloc to avoid the copy
- cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
+ free(cpu_ctx->work_data);
+ cpu_ctx->work_data = malloc(cplan.work_size);
+ if (cpu_ctx->work_data == NULL) {
+ cpu_ctx->work_size = 0;
+ return GGML_STATUS_ALLOC_FAILED;
+ }
  cpu_ctx->work_size = cplan.work_size;
  }
  cplan.work_data = cpu_ctx->work_data;
@@ -755,8 +805,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
  cplan.abort_callback = cpu_ctx->abort_callback;
  cplan.abort_callback_data = cpu_ctx->abort_callback_data;

- ggml_graph_compute(cgraph, &cplan);
- return true;
+ return ggml_graph_compute(cgraph, &cplan);
  }

  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -785,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
  /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
  /* .supports_op = */ ggml_backend_cpu_supports_op,
+ /* .event_new = */ NULL,
+ /* .event_free = */ NULL,
+ /* .event_record = */ NULL,
+ /* .event_wait = */ NULL,
+ /* .event_synchronize = */ NULL,
  };

  static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -940,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {

  // scheduler

- #define GGML_MAX_BACKENDS 16
- #define GGML_MAX_SPLITS 256
- #define GGML_MAX_SPLIT_INPUTS 16
+ #ifndef GGML_SCHED_MAX_BACKENDS
+ #define GGML_SCHED_MAX_BACKENDS 16
+ #endif
+
+ #ifndef GGML_SCHED_MAX_SPLITS
+ #define GGML_SCHED_MAX_SPLITS 256
+ #endif
+
+ #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+ #define GGML_SCHED_MAX_SPLIT_INPUTS 16
+ #endif
+
+ #ifndef GGML_SCHED_MAX_COPIES
+ #define GGML_SCHED_MAX_COPIES 4
+ #endif
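The scheduler limits above are now wrapped in #ifndef guards, so they can be overridden at build time; a hedged sketch (the exact mechanism depends on how this package compiles ggml-backend.c, e.g. a flag equivalent to -DGGML_SCHED_MAX_COPIES=2):

// hedged sketch: define before ggml-backend.c is compiled to change the
// number of pipeline-parallel copies; the #ifndef guard above keeps the default otherwise
#define GGML_SCHED_MAX_COPIES 2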
 
  struct ggml_backend_sched_split {
  int backend_id;
  int i_start;
  int i_end;
- struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+ struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
  int n_inputs;
  // graph view of this split
  struct ggml_cgraph graph;
@@ -956,45 +1022,53 @@ struct ggml_backend_sched_split {

  struct ggml_backend_sched {
  bool is_reset; // true if the scheduler has been reset since the last graph split
+ bool is_alloc;

  int n_backends;
- ggml_backend_t backends[GGML_MAX_BACKENDS];
- ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];

+ ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+ ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
  ggml_gallocr_t galloc;

  // hash keys of the nodes in the graph
  struct ggml_hash_set hash_set;
  // hash values
  int * tensor_backend_id;
- struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+ struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];

- int * node_backend_ids; // [n_nodes]
- int n_nodes;
+ int * node_backend_ids; // [graph_size]
+ int * leaf_backend_ids; // [graph_size]

  // copy of the graph with modified inputs
  struct ggml_cgraph * graph;

- struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+ // graph splits
+ struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
  int n_splits;

+ // pipeline parallelism support
+ int n_copies;
+ int cur_copy;
+ ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+ struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+ int n_graph_inputs;
+
  struct ggml_context * ctx;

  ggml_backend_sched_eval_callback callback_eval;
  void * callback_eval_user_data;

  // align context_buffer to GGML_MEM_ALIGN
- #ifdef _MSC_VER
+ #ifdef _MSC_VER
  __declspec(align(GGML_MEM_ALIGN))
- #else
+ #else
  __attribute__((aligned(GGML_MEM_ALIGN)))
- #endif
- char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+ #endif
+ char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
  };

- #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
- #define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
- #define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
+ #define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
+ #define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]

  // returns the priority of the backend, lower id is higher priority
  static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1006,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
  return -1;
  }

- static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+ ggml_backend_buffer_t buffer = tensor->buffer;
  if (buffer == NULL) {
  return -1;
  }
@@ -1017,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
  return i;
  }
  }
- GGML_ASSERT(false && "tensor buffer type not supported by any backend");
- return -1; // silence warning
+
+ fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
+ __func__, ggml_backend_buffer_name(buffer), tensor->name);
+ GGML_ASSERT(false);
+
+ return -1;
  }

  #if 0
- static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
  #define GET_CAUSE(node) causes[hash_id(node)]
  #else
@@ -1036,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

  // assign pre-allocated nodes to their backend
  // dst
- int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+ int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
  if (cur_backend != -1) {
- SET_CAUSE(node, "1.dst");
+ SET_CAUSE(tensor, "1.dst");
  return cur_backend;
  }
+
  // view_src
  if (tensor->view_src != NULL) {
- cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+ cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
  if (cur_backend != -1) {
- SET_CAUSE(node, "1.vsrc");
+ SET_CAUSE(tensor, "1.vsrc");
  return cur_backend;
  }
  }
+
+ // input
+ if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
+ cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+ SET_CAUSE(tensor, "1.inp");
+ return cur_backend;
+ }
+
  // assign nodes that use weights to the backend of the weights
  for (int i = 0; i < GGML_MAX_SRC; i++) {
  const struct ggml_tensor * src = tensor->src[i];
@@ -1056,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  continue;
  }
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
- int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
+ int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
  // operations with weights are always run on the same backend as the weights
- SET_CAUSE(node, "1.wgt%d", i);
+ SET_CAUSE(tensor, "1.wgt%d", i);
  return src_backend;
  }
  }
@@ -1094,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
  if (ggml_is_view_op(node->op)) {
  continue;
  }
- ggml_backend_t tensor_backend = tensor_backend(node);
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
  fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
  fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1102,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
  if (src == NULL) {
  continue;
  }
- ggml_backend_t src_backend = tensor_backend(src);
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
  fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
  }
@@ -1119,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
  static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
  // reset splits
  sched->n_splits = 0;
+ sched->n_graph_inputs = 0;
  sched->is_reset = false;

  struct ggml_init_params params = {
@@ -1164,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  }
  }
  #ifdef DEBUG_PASS1
- fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+ fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
  #endif

  // pass 2: expand current backend assignments
@@ -1172,10 +1261,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
  // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops

- // pass 2.1 expand gpu up
+
+ // pass 2.2 expand gpu down
  {
  int cur_backend_id = -1;
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
+ for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
@@ -1190,15 +1280,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  }
  } else {
  tensor_backend_id(node) = cur_backend_id;
- SET_CAUSE(node, "2.1");
+ SET_CAUSE(node, "2.2");
  }
  }
  }

- // pass 2.2 expand gpu down
+ // pass 2.1 expand gpu up
  {
  int cur_backend_id = -1;
- for (int i = 0; i < graph->n_nodes; i++) {
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
@@ -1213,15 +1303,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  }
  } else {
  tensor_backend_id(node) = cur_backend_id;
- SET_CAUSE(node, "2.2");
+ SET_CAUSE(node, "2.1");
  }
  }
  }

- // pass 2.3 expand rest up
+
+ // pass 2.4 expand rest down
  {
  int cur_backend_id = -1;
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
+ for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
@@ -1231,15 +1322,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  cur_backend_id = tensor_backend_id;
  } else {
  tensor_backend_id(node) = cur_backend_id;
- SET_CAUSE(node, "2.3");
+ SET_CAUSE(node, "2.4");
  }
  }
  }
-
- // pass 2.4 expand rest down
+ // pass 2.3 expand rest up
  {
  int cur_backend_id = -1;
- for (int i = 0; i < graph->n_nodes; i++) {
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
@@ -1249,12 +1339,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  cur_backend_id = tensor_backend_id;
  } else {
  tensor_backend_id(node) = cur_backend_id;
- SET_CAUSE(node, "2.4");
+ SET_CAUSE(node, "2.3");
  }
  }
  }
+
  #ifdef DEBUG_PASS2
- fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+ fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
  #endif

  // pass 3: assign backends to remaining src from dst and view_src
@@ -1284,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  }
  }
  #ifdef DEBUG_PASS3
- fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+ fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
  #endif

  // pass 4: split graph, find tensors that need to be copied
@@ -1316,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  if (tensor_backend_id != cur_backend_id) {
  sched->splits[cur_split].i_end = i;
  cur_split++;
- GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+ GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
  sched->splits[cur_split].backend_id = tensor_backend_id;
  sched->splits[cur_split].i_start = i;
  sched->splits[cur_split].n_inputs = 0;
@@ -1329,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  if (src == NULL) {
  continue;
  }
+
  int src_backend_id = tensor_backend_id(src);
  assert(src_backend_id != -1); // all inputs should be assigned by now
+
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+ size_t id = hash_id(src);
+ if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+ ggml_backend_t backend = sched->backends[src_backend_id];
+ for (int c = 0; c < sched->n_copies; c++) {
+ struct ggml_tensor * tensor_copy;
+ if (c == sched->cur_copy) {
+ tensor_copy = src; // use the original tensor as the current copy
+ } else {
+ tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+ ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+ }
+ if (sched->n_copies > 1) {
+ ggml_set_input(tensor_copy);
+ ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+ }
+ sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+ tensor_backend_id(tensor_copy) = src_backend_id;
+ SET_CAUSE(tensor_copy, "4.cpy");
+ }
+ int n_graph_inputs = sched->n_graph_inputs++;
+ GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+ sched->graph_inputs[n_graph_inputs] = src;
+ }
+ }
+
  if (src_backend_id != tensor_backend_id) {
  // create a copy of the input in the split's backend
  size_t id = hash_id(src);
- if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+ if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
  ggml_backend_t backend = sched->backends[cur_backend_id];
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
- ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
- sched->tensor_copies[id][cur_backend_id] = tensor_copy;
- tensor_backend_id(tensor_copy) = cur_backend_id;
- SET_CAUSE(tensor_copy, "4.cpy");
-
+ for (int c = 0; c < sched->n_copies; c++) {
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+ ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+ if (sched->n_copies > 1) {
+ ggml_set_input(tensor_copy);
+ ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+ }
+ sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+ tensor_backend_id(tensor_copy) = cur_backend_id;
+ SET_CAUSE(tensor_copy, "4.cpy");
+ }
  int n_inputs = sched->splits[cur_split].n_inputs++;
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+ GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
  sched->splits[cur_split].inputs[n_inputs] = src;
  }
- node->src[j] = sched->tensor_copies[id][cur_backend_id];
+ node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
  }
  }
  }
@@ -1355,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  sched->n_splits = cur_split + 1;
  }
  #ifdef DEBUG_PASS4
- fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+ fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
  #endif

  #ifndef NDEBUG
  // sanity check: all sources should have the same backend as the node
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
- ggml_backend_t tensor_backend = tensor_backend(node);
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
  if (tensor_backend == NULL) {
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
  }
- if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
+ if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
  fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
  node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
- node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
+ node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
+ ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
  }
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
  continue;
  }
- ggml_backend_t src_backend = tensor_backend(src);
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
  if (src_backend != tensor_backend /* && src_backend != NULL */) {
  fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
  node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
  j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
  }
- if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
+ if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
  fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
  src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
- src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
+ src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
+ ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
  }
  }
  }
@@ -1393,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  #endif

  // create copies of the graph for each split
- // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
+ // TODO: avoid this copy
+ struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
  for (int i = 0; i < sched->n_splits; i++) {
  struct ggml_backend_sched_split * split = &sched->splits[i];
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
  for (int j = 0; j < split->n_inputs; j++) {
  struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];

  // add a dependency to the input source so that it is not freed before the copy is done
  struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+ input_dep->src[0] = input;
  sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
  graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

@@ -1418,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
  }
  }
+
+ if (sched->n_copies > 1) {
+ // add input copies as leafs so that they are allocated first
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
+ struct ggml_tensor * input = sched->graph_inputs[i];
+ size_t id = hash_id(input);
+ int backend_id = tensor_backend_id(input);
+ for (int c = 0; c < sched->n_copies; c++) {
+ struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+ }
+ }
+
+ for (int i = 0; i < sched->n_splits; i++) {
+ struct ggml_backend_sched_split * split = &sched->splits[i];
+ int backend_id = split->backend_id;
+ for (int j = 0; j < split->n_inputs; j++) {
+ struct ggml_tensor * input = split->inputs[j];
+ size_t id = hash_id(input);
+ for (int c = 0; c < sched->n_copies; c++) {
+ struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+ }
+ }
+ }
+ }
+
+ // add leafs from the original graph
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
+ }
+
  sched->graph = graph_copy;
  }

  static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
- // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+ // allocate graph
  if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+ // the re-allocation may cause the split inputs to be moved to a different address
+ ggml_backend_sched_synchronize(sched);
  #ifndef NDEBUG
- fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
+ fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
  #endif
- ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+ ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
  if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
- fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
+ fprintf(stderr, "%s: failed to allocate graph\n", __func__);
  return false;
  }
  }
@@ -1437,10 +1602,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
  return true;
  }

- static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
- uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
- uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
+ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
  struct ggml_backend_sched_split * splits = sched->splits;

  for (int i = 0; i < sched->n_splits; i++) {
@@ -1449,33 +1611,36 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
  ggml_backend_t split_backend = sched->backends[split_backend_id];

  // copy the input tensors to the split backend
- uint64_t copy_start_us = ggml_time_us();
  for (int j = 0; j < split->n_inputs; j++) {
+ ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
  struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];

- GGML_ASSERT(input->buffer != NULL);
- GGML_ASSERT(input_cpy->buffer != NULL);
+ if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+ // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ ggml_backend_synchronize(split_backend);
+ }
+ ggml_backend_tensor_copy(input, input_cpy);
+ } else {
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ ggml_backend_synchronize(split_backend);
+ ggml_backend_synchronize(input_backend);
+ }

- ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
+ ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+ }
  }
- //ggml_backend_synchronize(split_backend); // necessary to measure copy time
- int64_t copy_end_us = ggml_time_us();
- copy_us[split_backend_id] += copy_end_us - copy_start_us;
-
- #if 0
- char split_filename[GGML_MAX_NAME];
- snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
- ggml_graph_dump_dot(split->graph, NULL, split_filename);
- #endif

-
- uint64_t compute_start_us = ggml_time_us();
  if (!sched->callback_eval) {
- if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
- return false;
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+ if (ec != GGML_STATUS_SUCCESS) {
+ return ec;
  }
- //ggml_backend_synchronize(split_backend); // necessary to measure compute time
  } else {
  // similar to ggml_backend_compare_graph_backend
  for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
@@ -1494,10 +1659,14 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {

  struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

- if (!ggml_backend_graph_compute(split_backend, &gv)) {
- return false;
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+ if (ec != GGML_STATUS_SUCCESS) {
+ return ec;
  }

+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+ ggml_backend_synchronize(split_backend);
+
  if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
  break;
  }
@@ -1505,39 +1674,54 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
  j0 = j1;
  }
  }
- uint64_t compute_end_us = ggml_time_us();
- compute_us[split_backend_id] += compute_end_us - compute_start_us;
- }

- #if 0
- // per-backend timings
- fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
- for (int i = 0; i < sched->n_backends; i++) {
- if (copy_us[i] > 0 || compute_us[i] > 0) {
- fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+ // record the event of this copy
+ if (split->n_inputs > 0) {
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+ }
  }
  }
- #endif

- return true;
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
+
+ return GGML_STATUS_SUCCESS;
  }
 
- ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
+ ggml_backend_sched_t ggml_backend_sched_new(
+ ggml_backend_t * backends,
+ ggml_backend_buffer_type_t * bufts,
+ int n_backends,
+ size_t graph_size,
+ bool parallel) {
  GGML_ASSERT(n_backends > 0);
- GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
+ GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+ GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

  struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

  // initialize hash table
- sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
  sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
  sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
  sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
+ sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);

  sched->n_backends = n_backends;
- for (int i = 0; i < n_backends; i++) {
- sched->backends[i] = backends[i];
- sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
+
+ sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+ GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+
+ for (int b = 0; b < n_backends; b++) {
+ sched->backends[b] = backends[b];
+ sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+ GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+ if (sched->n_copies > 1) {
+ for (int c = 0; c < sched->n_copies; c++) {
+ sched->events[b][c] = ggml_backend_event_new(backends[b]);
+ }
+ }
  }

  sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
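A hedged sketch of the new constructor signature (the backend handles and graph size are placeholders; per the assertion above, the last backend must be the CPU backend):

// hedged sketch: create a scheduler with pipeline parallelism enabled
ggml_backend_t backends[2] = { gpu_backend, cpu_backend };   // hypothetical handles
ggml_backend_sched_t sched = ggml_backend_sched_new(
    backends, /* bufts */ NULL, /* n_backends */ 2, /* graph_size */ 2048, /* parallel */ true);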
@@ -1551,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
  if (sched == NULL) {
  return;
  }
+ for (int b = 0; b < sched->n_backends; b++) {
+ for (int c = 0; c < sched->n_copies; c++) {
+ ggml_backend_event_free(sched->events[b][c]);
+ }
+ }
  ggml_gallocr_free(sched->galloc);
  ggml_free(sched->ctx);
  free(sched->hash_set.keys);
  free(sched->tensor_backend_id);
  free(sched->tensor_copies);
  free(sched->node_backend_ids);
+ free(sched->leaf_backend_ids);
  free(sched);
  }

@@ -1568,38 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
  memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

  sched->is_reset = true;
+ sched->is_alloc = false;
  }

  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
  ggml_backend_sched_split_graph(sched, measure_graph);

- if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
+ // TODO: extract this to a separate function
+ if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
  return false;
  }

  ggml_backend_sched_reset(sched);
+ ggml_backend_sched_synchronize(sched);
+
  return true;
  }

- bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
-
- if (!sched->is_reset) {
- ggml_backend_sched_reset(sched);
- }
+ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);

  ggml_backend_sched_split_graph(sched, graph);
+
  if (!ggml_backend_sched_alloc_splits(sched)) {
  return false;
  }

- if (!ggml_backend_sched_compute_splits(sched)) {
- return false;
- }
+ sched->is_alloc = true;

  return true;
  }

+ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+ ggml_backend_sched_synchronize(sched);
+ return err;
+ }
+
+ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ if (!sched->is_reset && !sched->is_alloc) {
+ ggml_backend_sched_reset(sched);
+ }
+
+ if (!sched->is_alloc) {
+ if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+ return GGML_STATUS_ALLOC_FAILED;
+ }
+ }
+
+ return ggml_backend_sched_compute_splits(sched);
+ }
+
+ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+ for (int i = 0; i < sched->n_backends; i++) {
+ ggml_backend_synchronize(sched->backends[i]);
+ }
+ }
+
  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
  sched->callback_eval = callback;
  sched->callback_eval_user_data = user_data;
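A hedged caller-side sketch of the split between the new synchronous and asynchronous scheduler entry points (sched and graph are assumed to come from elsewhere, e.g. the constructor sketch above):

// hedged sketch: blocking compute (allocates on demand, then synchronizes internally)
enum ggml_status st = ggml_backend_sched_graph_compute(sched, graph);

// or submit asynchronously, overlap host-side work, then wait explicitly
st = ggml_backend_sched_graph_compute_async(sched, graph);
// ... other host work ...
ggml_backend_sched_synchronize(sched);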
@@ -1609,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
  return sched->n_splits;
  }

+ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+ return sched->n_copies;
+ }
+
  size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
  return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
  }

- void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
  tensor_backend_id(node) = backend_index;
  }

- ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
  int backend_index = tensor_backend_id(node);
  if (backend_index == -1) {
  return NULL;