llama_cpp 0.12.5 → 0.12.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries. The hunks below cover ggml-backend.c from the vendored llama.cpp sources.
@@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

+ if (!size) {
+ return;
+ }
+
  tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
  }

@@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
  GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

+ if (!size) {
+ return;
+ }
+
  tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
  }

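Both ggml_backend_tensor_set and ggml_backend_tensor_get now return early when size is zero, so an empty read or write never reaches the backend's set_tensor/get_tensor callbacks. A minimal caller-side sketch of where that matters, assuming only the public API above (the helper name and bounds check are illustrative, not part of the package):

    #include <stdint.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Writes n bytes into a tensor; n may legitimately be 0 (e.g. an empty slice),
    // which is now a safe no-op inside ggml_backend_tensor_set.
    static void write_bytes(struct ggml_tensor * t, const uint8_t * src, size_t offset, size_t n) {
        if (offset + n > ggml_nbytes(t)) {
            return; // caller error: out of bounds
        }
        ggml_backend_tensor_set(t, src, offset, n);
    }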
@@ -475,6 +483,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {

  // backend CPU

+ static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
+
  GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
  return "CPU";

@@ -482,7 +492,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t
  }

  GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
- return (void *)buffer->context;
+ uintptr_t data = (uintptr_t)buffer->context;
+
+ // align the buffer
+ if (data % TENSOR_ALIGNMENT != 0) {
+ data = GGML_PAD(data, TENSOR_ALIGNMENT);
+ }
+
+ return (void *)data;
  }

  GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
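ggml_backend_cpu_buffer_get_base now rounds the raw malloc pointer up to TENSOR_ALIGNMENT (32 bytes, matching the gguf/mmap guarantee) instead of returning it as-is. A small sketch of the round-up arithmetic; the GGML_PAD macro in ggml.h is assumed to be the usual power-of-two padding, shown here as a standalone helper:

    #include <stdint.h>
    #include <stdio.h>

    // Round x up to the next multiple of n (n must be a power of two),
    // i.e. the same shape as GGML_PAD(x, n) = ((x + n - 1) & ~(n - 1)).
    static uintptr_t pad_to(uintptr_t x, uintptr_t n) {
        return (x + n - 1) & ~(n - 1);
    }

    int main(void) {
        // 0x1004 is not 32-byte aligned; the padded base becomes 0x1020
        printf("0x%lx\n", (unsigned long) pad_to(0x1004, 32));
        return 0;
    }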
@@ -540,8 +557,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
  /* .reset = */ NULL,
  };

- static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
-
  GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
  return "CPU";

@@ -550,9 +565,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend

  GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
- void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
-
- GGML_ASSERT(data != NULL && "failed to allocate buffer");
+ void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+ if (data == NULL) {
+ fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+ return NULL;
+ }

  return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
  }
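The CPU buffer type now reports allocation failure by returning NULL (with a message on stderr) instead of asserting. A hedged caller-side sketch, assuming the public ggml_backend_alloc_buffer entry point; the wrapper name is illustrative:

    #include <stdio.h>
    #include "ggml-backend.h"

    static ggml_backend_buffer_t try_alloc(ggml_backend_t backend, size_t size) {
        ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, size);
        if (buf == NULL) {
            // allocation failures are now recoverable instead of fatal
            fprintf(stderr, "failed to allocate %zu bytes for the backend buffer\n", size);
        }
        return buf;
    }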
@@ -653,6 +670,9 @@ struct ggml_backend_cpu_context {
  int n_threads;
  void * work_data;
  size_t work_size;
+
+ ggml_abort_callback abort_callback;
+ void * abort_callback_data;
  };

  GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +711,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
  cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
  }

+ cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
+ cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
  return cpu_plan;
  }

@@ -721,9 +744,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
  cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
  cpu_ctx->work_size = cplan.work_size;
  }
-
  cplan.work_data = cpu_ctx->work_data;

+ cplan.abort_callback = cpu_ctx->abort_callback;
+ cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
  ggml_graph_compute(cgraph, &cplan);
  return true;
  }
@@ -731,7 +756,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  switch (op->op) {
  case GGML_OP_CPY:
- return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+ return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
  case GGML_OP_MUL_MAT:
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
  default:
@@ -758,12 +783,21 @@ static struct ggml_backend_i cpu_backend_i = {

  ggml_backend_t ggml_backend_cpu_init(void) {
  struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+ if (ctx == NULL) {
+ return NULL;
+ }

- ctx->n_threads = GGML_DEFAULT_N_THREADS;
- ctx->work_data = NULL;
- ctx->work_size = 0;
+ ctx->n_threads = GGML_DEFAULT_N_THREADS;
+ ctx->work_data = NULL;
+ ctx->work_size = 0;
+ ctx->abort_callback = NULL;
+ ctx->abort_callback_data = NULL;

  ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+ if (cpu_backend == NULL) {
+ free(ctx);
+ return NULL;
+ }

  *cpu_backend = (struct ggml_backend) {
  /* .interface = */ cpu_backend_i,
@@ -783,7 +817,16 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
  ctx->n_threads = n_threads;
  }

+ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+ GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+ struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+ ctx->abort_callback = abort_callback;
+ ctx->abort_callback_data = abort_callback_data;
+ }
+
  GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+ GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
  return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
  }

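ggml_backend_cpu_set_abort_callback stores a ggml_abort_callback in the CPU backend context, and the callback is forwarded into every cplan (see the graph_plan_create and graph_compute hunks above), so it is polled while a graph is being computed; returning true asks ggml to abort. A hedged usage sketch, assuming a flag set from another thread (the flag and function names are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include "ggml-backend.h"

    static atomic_bool g_stop_requested; // set to true elsewhere to interrupt

    // ggml_abort_callback: returning true aborts the current graph computation
    static bool should_abort(void * data) {
        (void) data;
        return atomic_load(&g_stop_requested);
    }

    static ggml_backend_t make_interruptible_cpu_backend(void) {
        ggml_backend_t backend = ggml_backend_cpu_init();
        if (backend != NULL) {
            ggml_backend_cpu_set_abort_callback(backend, should_abort, NULL);
        }
        return backend;
    }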
@@ -847,6 +890,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
  ctx->n_buffers = n_buffers;
  ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));

+ GGML_ASSERT(ctx->buffers != NULL);
+
  size_t total_size = 0;
  for (size_t i = 0; i < n_buffers; i++) {
  ctx->buffers[i] = buffers[i];
@@ -868,6 +913,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
  }
  }

+ // creates a copy of the tensor with the same memory layout
+ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+ struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ dup->nb[i] = tensor->nb[i];
+ }
+ return dup;
+ }
+
+ static bool ggml_is_view_op(enum ggml_op op) {
+ return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+ }

  // scheduler

@@ -876,7 +933,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
  #define GGML_MAX_SPLIT_INPUTS 16

  struct ggml_backend_sched_split {
- ggml_tallocr_t tallocr;
+ int backend_id;
  int i_start;
  int i_end;
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
@@ -891,15 +948,17 @@ struct ggml_backend_sched {
  int n_backends;
  ggml_backend_t backends[GGML_MAX_BACKENDS];
  ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
- ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];

  ggml_gallocr_t galloc;

  // hash keys of the nodes in the graph
  struct ggml_hash_set hash_set;
- // hash values (arrays of [hash_set.size])
- ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
- struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
+ // hash values
+ int * tensor_backend_id;
+ struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+
+ int * node_backend_ids; // [n_nodes]
+ int n_nodes;

  // copy of the graph with modified inputs
  struct ggml_cgraph * graph;
@@ -909,75 +968,45 @@ struct ggml_backend_sched {

  struct ggml_context * ctx;

+ ggml_backend_sched_eval_callback callback_eval;
+ void * callback_eval_user_data;
+
  // align context_buffer to GGML_MEM_ALIGN
  #ifdef _MSC_VER
  __declspec(align(GGML_MEM_ALIGN))
  #else
  __attribute__((aligned(GGML_MEM_ALIGN)))
  #endif
- char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
-
- ggml_backend_sched_eval_callback callback_eval;
- void * callback_eval_user_data;
+ char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
  };

  #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
- #define node_allocr(node) sched->node_talloc[hash_id(node)]
+ #define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
+ #define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])

- static bool ggml_is_view_op(enum ggml_op op) {
- return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
- }
-
- // returns the priority of the backend, lower is better
- static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
+ // returns the priority of the backend, lower id is higher priority
+ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
  for (int i = 0; i < sched->n_backends; i++) {
  if (sched->backends[i] == backend) {
  return i;
  }
  }
- return INT_MAX;
- }
-
- static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
- for (int i = 0; i < sched->n_backends; i++) {
- if (sched->tallocs[i] == allocr) {
- return i;
- }
- }
- return INT_MAX;
+ return -1;
  }

- static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
  if (buffer == NULL) {
- return NULL;
- }
-
- // check if this is already allocate in a allocr buffer (from user manual allocations)
- for (int i = 0; i < sched->n_backends; i++) {
- if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
- return sched->tallocs[i];
- }
+ return -1;
  }

  // find highest prio backend that supports the buffer type
  for (int i = 0; i < sched->n_backends; i++) {
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
- return sched->tallocs[i];
+ return i;
  }
  }
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
- }
-
- static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
- if (allocr == NULL) {
- return NULL;
- }
- for (int i = 0; i < sched->n_backends; i++) {
- if (sched->tallocs[i] == allocr) {
- return sched->backends[i];
- }
- }
- GGML_UNREACHABLE();
+ return -1; // silence warning
  }

  #if 0
@@ -990,37 +1019,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I
  #endif

  // returns the backend that should be used for the node based on the current locations
- static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
+ // TODO: use supports_op to check if the backend supports the op
+
  // assign pre-allocated nodes to their backend
  // dst
- ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
- if (cur_allocr != NULL) {
+ int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+ if (cur_backend != -1) {
  SET_CAUSE(node, "1.dst");
- return cur_allocr;
+ return cur_backend;
  }
  // view_src
- if (node->view_src != NULL) {
- cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
- if (cur_allocr != NULL) {
+ if (tensor->view_src != NULL) {
+ cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+ if (cur_backend != -1) {
  SET_CAUSE(node, "1.vsrc");
- return cur_allocr;
+ return cur_backend;
  }
  }
  // assign nodes that use weights to the backend of the weights
  for (int i = 0; i < GGML_MAX_SRC; i++) {
- const struct ggml_tensor * src = node->src[i];
+ const struct ggml_tensor * src = tensor->src[i];
  if (src == NULL) {
- break;
+ continue;
  }
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
- ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
+ int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
  // operations with weights are always run on the same backend as the weights
  SET_CAUSE(node, "1.wgt%d", i);
- return src_allocr;
+ return src_backend;
  }
  }

- return NULL;
+ return -1;
  }

  static char * fmt_size(size_t size) {
@@ -1033,11 +1064,11 @@ static char * fmt_size(size_t size) {
  return buffer;
  }

- static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
  int cur_split = 0;
  for (int i = 0; i < graph->n_nodes; i++) {
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
- ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
+ ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
  fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
  sched->splits[cur_split].n_inputs);
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
@@ -1051,17 +1082,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
  if (ggml_is_view_op(node->op)) {
  continue;
  }
- ggml_tallocr_t node_allocr = node_allocr(node);
- ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
+ ggml_backend_t tensor_backend = tensor_backend(node);
  fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
- fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
- ggml_tallocr_t src_allocr = node_allocr(src);
- ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
+ ggml_backend_t src_backend = tensor_backend(src);
  fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
  }
@@ -1069,23 +1098,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
  }
  }

- // creates a copy of the tensor with the same memory layout
- static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
- struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
- dup->nb[i] = tensor->nb[i];
- }
- return dup;
- }
-
-
  //#define DEBUG_PASS1
  //#define DEBUG_PASS2
  //#define DEBUG_PASS3
  //#define DEBUG_PASS4

  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
- static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
  // reset splits
  sched->n_splits = 0;
  sched->is_reset = false;
@@ -1107,28 +1126,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
  // pass 1: assign backends to ops with pre-allocated inputs
  for (int i = 0; i < graph->n_leafs; i++) {
  struct ggml_tensor * leaf = graph->leafs[i];
- if (node_allocr(leaf) != NULL) {
+ if (tensor_backend_id(leaf) != -1) {
  // do not overwrite user assignments
  continue;
  }
- node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
+ tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
  }

  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
- if (node_allocr(node) != NULL) {
+ if (tensor_backend_id(node) != -1) {
  // do not overwrite user assignments
  continue;
  }
- node_allocr(node) = sched_allocr_from_cur(sched, node);
+ tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
  // src
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
- if (node_allocr(src) == NULL) {
- node_allocr(src) = sched_allocr_from_cur(sched, src);
+ if (tensor_backend_id(src) == -1) {
+ tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
  }
  }
  }
@@ -1143,22 +1162,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g

  // pass 2.1 expand gpu up
  {
- ggml_tallocr_t cur_allocr = NULL;
+ int cur_backend_id = -1;
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
  }
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr != NULL) {
- if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+ int tensor_backend_id = tensor_backend_id(node);
+ if (tensor_backend_id != -1) {
+ if (tensor_backend_id == sched->n_backends - 1) {
  // skip cpu (lowest prio backend)
- cur_allocr = NULL;
+ cur_backend_id = -1;
  } else {
- cur_allocr = node_allocr;
+ cur_backend_id = tensor_backend_id;
  }
  } else {
- node_allocr(node) = cur_allocr;
+ tensor_backend_id(node) = cur_backend_id;
  SET_CAUSE(node, "2.1");
  }
  }
@@ -1166,22 +1185,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g

  // pass 2.2 expand gpu down
  {
- ggml_tallocr_t cur_allocr = NULL;
+ int cur_backend_id = -1;
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
  }
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr != NULL) {
- if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+ int tensor_backend_id = tensor_backend_id(node);
+ if (tensor_backend_id != -1) {
+ if (tensor_backend_id == sched->n_backends - 1) {
  // skip cpu (lowest prio backend)
- cur_allocr = NULL;
+ cur_backend_id = -1;
  } else {
- cur_allocr = node_allocr;
+ cur_backend_id = tensor_backend_id;
  }
  } else {
- node_allocr(node) = cur_allocr;
+ tensor_backend_id(node) = cur_backend_id;
  SET_CAUSE(node, "2.2");
  }
  }
@@ -1189,17 +1208,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g

  // pass 2.3 expand rest up
  {
- ggml_tallocr_t cur_allocr = NULL;
+ int cur_backend_id = -1;
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
  }
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr != NULL) {
- cur_allocr = node_allocr;
+ int tensor_backend_id = tensor_backend_id(node);
+ if (tensor_backend_id != -1) {
+ cur_backend_id = tensor_backend_id;
  } else {
- node_allocr(node) = cur_allocr;
+ tensor_backend_id(node) = cur_backend_id;
  SET_CAUSE(node, "2.3");
  }
  }
@@ -1207,17 +1226,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g

  // pass 2.4 expand rest down
  {
- ggml_tallocr_t cur_allocr = NULL;
+ int cur_backend_id = -1;
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
  }
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr != NULL) {
- cur_allocr = node_allocr;
+ int tensor_backend_id = tensor_backend_id(node);
+ if (tensor_backend_id != -1) {
+ cur_backend_id = tensor_backend_id;
  } else {
- node_allocr(node) = cur_allocr;
+ tensor_backend_id(node) = cur_backend_id;
  SET_CAUSE(node, "2.4");
  }
  }
@@ -1229,24 +1248,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
  // pass 3: assign backends to remaining src from dst and view_src
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
- ggml_tallocr_t cur_allocr = node_allocr(node);
- if (node->view_src != NULL && cur_allocr == NULL) {
- cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+ int cur_backend_id = tensor_backend_id(node);
+ if (node->view_src != NULL && cur_backend_id == -1) {
+ cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
  SET_CAUSE(node, "3.vsrc");
  }
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
- ggml_tallocr_t src_allocr = node_allocr(src);
- if (src_allocr == NULL) {
+ int src_backend_id = tensor_backend_id(src);
+ if (src_backend_id == -1) {
  if (src->view_src != NULL) {
  // views are always on the same backend as the source
- node_allocr(src) = node_allocr(src->view_src);
+ tensor_backend_id(src) = tensor_backend_id(src->view_src);
  SET_CAUSE(src, "3.vsrc");
  } else {
- node_allocr(src) = cur_allocr;
+ tensor_backend_id(src) = cur_backend_id;
  SET_CAUSE(src, "3.cur");
  }
  }
@@ -1263,15 +1282,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  if (!ggml_is_view_op(node->op)) {
- sched->splits[0].tallocr = node_allocr(node);
+ sched->splits[0].backend_id = tensor_backend_id(node);
  break;
  }
  }
  sched->splits[0].i_start = 0;
  sched->splits[0].n_inputs = 0;
  memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
- ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+ int cur_backend_id = sched->splits[0].backend_id;
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];

@@ -1279,64 +1297,45 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
  continue;
  }

- ggml_tallocr_t node_allocr = node_allocr(node);
+ int tensor_backend_id = tensor_backend_id(node);

- GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
+ GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now

- if (node_allocr != cur_allocr) {
+ if (tensor_backend_id != cur_backend_id) {
  sched->splits[cur_split].i_end = i;
  cur_split++;
  GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
- sched->splits[cur_split].tallocr = node_allocr;
+ sched->splits[cur_split].backend_id = tensor_backend_id;
  sched->splits[cur_split].i_start = i;
  sched->splits[cur_split].n_inputs = 0;
- cur_allocr = node_allocr;
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+ cur_backend_id = tensor_backend_id;
  }

  // find inputs that are not on the same backend
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
- ggml_tallocr_t src_allocr = node_allocr(src);
- GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
- if (src_allocr != node_allocr) {
+ int src_backend_id = tensor_backend_id(src);
+ assert(src_backend_id != -1); // all inputs should be assigned by now
+ if (src_backend_id != tensor_backend_id) {
  // create a copy of the input in the split's backend
  size_t id = hash_id(src);
- if (sched->node_copies[id][cur_backend_id] == NULL) {
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+ if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+ ggml_backend_t backend = sched->backends[cur_backend_id];
  struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
  ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);

- sched->node_copies[id][cur_backend_id] = tensor_copy;
- node_allocr(tensor_copy) = cur_allocr;
+ sched->tensor_copies[id][cur_backend_id] = tensor_copy;
+ tensor_backend_id(tensor_copy) = cur_backend_id;
  SET_CAUSE(tensor_copy, "4.cpy");

  int n_inputs = sched->splits[cur_split].n_inputs++;
  GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
  sched->splits[cur_split].inputs[n_inputs] = src;
  }
- node->src[j] = sched->node_copies[id][cur_backend_id];
-
- #if 0
- // check if the input is already in the split
- bool found = false;
- for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
- if (sched->splits[cur_split].inputs[k] == src) {
- found = true;
- break;
- }
- }
-
- if (!found) {
- int n_inputs = sched->splits[cur_split].n_inputs++;
- //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
- sched->splits[cur_split].inputs[n_inputs] = src;
- }
- #endif
+ node->src[j] = sched->tensor_copies[id][cur_backend_id];
  }
  }
  }
@@ -1351,30 +1350,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
  // sanity check: all sources should have the same backend as the node
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr == NULL) {
+ ggml_backend_t tensor_backend = tensor_backend(node);
+ if (tensor_backend == NULL) {
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
  }
- if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+ if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
  fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
- node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
- node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+ node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+ node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
  }
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
  if (src == NULL) {
- break;
+ continue;
  }
- ggml_tallocr_t src_allocr = node_allocr(src);
- if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
+ ggml_backend_t src_backend = tensor_backend(src);
+ if (src_backend != tensor_backend /* && src_backend != NULL */) {
  fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
- node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
- j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
+ node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+ j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
  }
- if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+ if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
  fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
- src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
- src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+ src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
+ src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
  }
  }
  }
@@ -1388,32 +1387,45 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
  struct ggml_backend_sched_split * split = &sched->splits[i];
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
  for (int j = 0; j < split->n_inputs; j++) {
  struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+
  // add a dependency to the input source so that it is not freed before the copy is done
- GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
- input_cpy->src[0] = input;
+ struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
+
+ // add a dependency to the input copy so that it is allocated at the start of the split
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
  }

  for (int j = split->i_start; j < split->i_end; j++) {
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
  graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
  }
  }
  sched->graph = graph_copy;
  }

- static void sched_alloc_splits(ggml_backend_sched_t sched) {
- ggml_gallocr_alloc_graph_n(
- sched->galloc,
- sched->graph,
- sched->hash_set,
- sched->node_talloc);
+ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+ // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+ if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+ #ifndef NDEBUG
+ fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
+ #endif
+ ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+ if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+ fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
+ return false;
+ }
+ }
+
+ return true;
  }

- static void sched_compute_splits(ggml_backend_sched_t sched) {
+ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
  uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
  uint64_t compute_us[GGML_MAX_BACKENDS] = {0};

@@ -1421,20 +1433,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {

  for (int i = 0; i < sched->n_splits; i++) {
  struct ggml_backend_sched_split * split = &splits[i];
- ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
- int split_backend_id = sched_backend_prio(sched, split_backend);
+ int split_backend_id = split->backend_id;
+ ggml_backend_t split_backend = sched->backends[split_backend_id];

  // copy the input tensors to the split backend
  uint64_t copy_start_us = ggml_time_us();
  for (int j = 0; j < split->n_inputs; j++) {
  struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];

  GGML_ASSERT(input->buffer != NULL);
  GGML_ASSERT(input_cpy->buffer != NULL);

- // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
- // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
  ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
  }
  //ggml_backend_synchronize(split_backend); // necessary to measure copy time
@@ -1450,7 +1460,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {

  uint64_t compute_start_us = ggml_time_us();
  if (!sched->callback_eval) {
- ggml_backend_graph_compute(split_backend, &split->graph);
+ if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
+ return false;
+ }
  //ggml_backend_synchronize(split_backend); // necessary to measure compute time
  } else {
  // similar to ggml_backend_compare_graph_backend
@@ -1470,7 +1482,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {

  struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

- ggml_backend_graph_compute(split_backend, &gv);
+ if (!ggml_backend_graph_compute(split_backend, &gv)) {
+ return false;
+ }

  if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
  break;
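The hunk above shows the scheduler calling sched->callback_eval(t, false, ...) after computing part of a split and stopping early when it returns false. A hedged sketch of such a callback; the exact typedef is not shown in this diff, so the (tensor, ask, user_data) signature below is an assumption based on how it is called here:

    #include <stdbool.h>
    #include "ggml.h"

    // ask == true: the scheduler asks whether this node should be observed.
    // ask == false: the node has just been computed; returning false stops
    // evaluation of the remaining nodes.
    static bool observe_soft_max(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            return t->op == GGML_OP_SOFT_MAX; // only break after soft-max nodes
        }
        // t->data is valid here and can be inspected or logged
        return true; // keep computing; return false to stop early
    }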
@@ -1492,19 +1506,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
  }
  }
  #endif
- }

- static void sched_reset(ggml_backend_sched_t sched) {
- for (int i = 0; i < sched->n_backends; i++) {
- ggml_tallocr_reset(sched->tallocs[i]);
- }
- // reset state for the next run
- size_t hash_size = sched->hash_set.size;
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
-
- sched->is_reset = true;
+ return true;
  }

  ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1514,9 +1517,10 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
  struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

  // initialize hash table
- sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
- sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
- sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+ sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
+ sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+ sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);

  sched->n_backends = n_backends;
  for (int i = 0; i < n_backends; i++) {
@@ -1524,14 +1528,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
  sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
  }

- sched->galloc = ggml_gallocr_new();
-
- // init measure allocs for each backend
- for (int i = 0; i < n_backends; i++) {
- sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
- }
+ sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);

- sched_reset(sched);
+ ggml_backend_sched_reset(sched);

  return sched;
  }
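ggml_backend_sched_new now creates a single multi-backend graph allocator with ggml_gallocr_new_n instead of one measure allocator per backend. A hedged construction sketch (caller side, not part of the diff); gpu_backend is assumed to come from some other backend's init function, and the CPU backend is placed last since the scheduler treats the last backend as the lowest-priority fallback:

    #include "ggml.h"
    #include "ggml-backend.h"

    static ggml_backend_sched_t make_sched(ggml_backend_t gpu_backend) {
        ggml_backend_t backends[2] = { gpu_backend, ggml_backend_cpu_init() };
        // NULL bufts selects each backend's default buffer type
        return ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);
    }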
@@ -1540,49 +1539,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
  if (sched == NULL) {
  return;
  }
- for (int i = 0; i < sched->n_backends; i++) {
- ggml_tallocr_free(sched->tallocs[i]);
- }
  ggml_gallocr_free(sched->galloc);
  ggml_free(sched->ctx);
  free(sched->hash_set.keys);
- free(sched->node_talloc);
- free(sched->node_copies);
+ free(sched->tensor_backend_id);
+ free(sched->tensor_copies);
+ free(sched->node_backend_ids);
  free(sched);
  }

- void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
- GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
+ // reset state for the next run
+ size_t hash_size = sched->hash_set.size;
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+ memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+ memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

- sched_split_graph(sched, measure_graph);
- sched_alloc_splits(sched);
+ sched->is_reset = true;
+ }

- // allocate buffers and reset allocators
- for (int i = 0; i < sched->n_backends; i++) {
- size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
- ggml_tallocr_free(sched->tallocs[i]);
- sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
+ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+ ggml_backend_sched_split_graph(sched, measure_graph);
+
+ if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
+ return false;
  }

- sched_reset(sched);
+ ggml_backend_sched_reset(sched);
+ return true;
  }

- void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);

  if (!sched->is_reset) {
- sched_reset(sched);
+ ggml_backend_sched_reset(sched);
  }

- sched_split_graph(sched, graph);
- sched_alloc_splits(sched);
- sched_compute_splits(sched);
- }
+ ggml_backend_sched_split_graph(sched, graph);
+ if (!ggml_backend_sched_alloc_splits(sched)) {
+ return false;
+ }

- void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
- sched_reset(sched);
- }
+ if (!ggml_backend_sched_compute_splits(sched)) {
+ return false;
+ }

+ return true;
+ }

  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
  sched->callback_eval = callback;
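ggml_backend_sched_init_measure is replaced by ggml_backend_sched_reserve, and both it and ggml_backend_sched_graph_compute now return bool, so callers are expected to check the results instead of assuming success. A hedged driver sketch; graph construction is omitted and the two graphs are assumed to be built elsewhere:

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    static bool reserve_and_run(ggml_backend_sched_t sched,
                                struct ggml_cgraph * measure_graph,
                                struct ggml_cgraph * graph) {
        // one-time reservation of backend buffers from a worst-case graph
        if (!ggml_backend_sched_reserve(sched, measure_graph)) {
            fprintf(stderr, "failed to reserve scheduler buffers\n");
            return false;
        }
        // split, allocate and compute; false signals a failed allocation
        // or a failed backend compute
        if (!ggml_backend_sched_graph_compute(sched, graph)) {
            fprintf(stderr, "graph compute failed\n");
            return false;
        }
        ggml_backend_sched_reset(sched); // clear assignments before the next graph
        return true;
    }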
@@ -1593,37 +1597,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
  return sched->n_splits;
  }

- ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
- int backend_index = sched_backend_prio(sched, backend);
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
- return sched->tallocs[backend_index];
- }
-
- ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
- int backend_index = sched_backend_prio(sched, backend);
+ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
- return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
+ return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
  }

  void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
- int backend_index = sched_backend_prio(sched, backend);
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
- node_allocr(node) = sched->tallocs[backend_index];
+ tensor_backend_id(node) = backend_index;
  }

  ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
- ggml_tallocr_t allocr = node_allocr(node);
- if (allocr == NULL) {
+ int backend_index = tensor_backend_id(node);
+ if (backend_index == -1) {
  return NULL;
  }
- return get_allocr_backend(sched, allocr);
+ return sched->backends[backend_index];
  }

  // utils

  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->buffer == NULL);
- //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
  GGML_ASSERT(tensor->view_src != NULL);
  GGML_ASSERT(tensor->view_src->buffer != NULL);
  GGML_ASSERT(tensor->view_src->data != NULL);
@@ -1647,7 +1644,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
  ggml_backend_buffer_init_tensor(buffer, tensor);
  }

- static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
+ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
  struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {

  GGML_ASSERT(src != NULL);
@@ -1660,7 +1657,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru

  struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
  if (src->view_src != NULL) {
- dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
  dst->view_offs = src->view_offs;
  }
  dst->op = src->op;
@@ -1671,16 +1668,16 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
  for (int i = 0; i < GGML_MAX_SRC; i++) {
  struct ggml_tensor * s = src->src[i];
  if (s == NULL) {
- break;
+ continue;
  }
- dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
  }

  node_copies[id] = dst;
  return dst;
  }

- static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
  size_t id = ggml_hash_find(hash_set, src);
  if (node_init[id]) {
  return;
@@ -1689,7 +1686,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor

  struct ggml_tensor * dst = node_copies[id];
  if (dst->view_src != NULL) {
- graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
  ggml_backend_view_init(dst->view_src->buffer, dst);
  }
  else {
@@ -1700,19 +1697,19 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
  for (int i = 0; i < GGML_MAX_SRC; i++) {
  struct ggml_tensor * s = src->src[i];
  if (s == NULL) {
- break;
+ continue;
  }
- graph_init_tensor(hash_set, node_copies, node_init, s);
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
  }
  }

  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
  struct ggml_hash_set hash_set = {
  /* .size = */ graph->visited_hash_table.size,
- /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
+ /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
  };
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
- bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
+ struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
+ bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);

  struct ggml_init_params params = {
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -1741,7 +1738,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  // dup nodes
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
- graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
  }

  // allocate nodes
@@ -1766,7 +1763,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  // copy data and init views
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
- graph_init_tensor(hash_set, node_copies, node_init, node);
+ graph_copy_init_tensor(hash_set, node_copies, node_init, node);
  }

  // build graph copy