llama_cpp 0.12.5 → 0.12.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -475,6 +475,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
 
 // backend CPU
 
+static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
+
 GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -482,7 +484,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t
 }
 
 GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)buffer->context;
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
 }
 
 GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
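Note: GGML_PAD rounds the pointer up to the next multiple of TENSOR_ALIGNMENT, so unaligned mmap'd gguf data is handed out at a 32-byte boundary. A minimal self-contained sketch of the same rounding (the pad_to helper below is illustrative, not part of the package):

```c
#include <stdint.h>
#include <stdio.h>

// Illustrative stand-in for the padding step above: round x up to the next
// multiple of n, where n is a power of two (here, the 32-byte gguf alignment).
static uintptr_t pad_to(uintptr_t x, uintptr_t n) {
    return (x + n - 1) & ~(n - 1);
}

int main(void) {
    const uintptr_t TENSOR_ALIGNMENT = 32;
    uintptr_t unaligned = 0x1005;
    printf("0x%zx -> 0x%zx\n", (size_t) unaligned, (size_t) pad_to(unaligned, TENSOR_ALIGNMENT)); // 0x1005 -> 0x1020
    return 0;
}
```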
@@ -540,8 +549,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .reset   = */ NULL,
 };
 
-static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
-
 GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";
 
@@ -550,9 +557,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend
 
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
-
-    GGML_ASSERT(data != NULL && "failed to allocate buffer");
+    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
 
     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }
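Note: with this change a failed CPU allocation logs an error and returns NULL instead of asserting, so callers should check the result. A hedged caller-side sketch (ggml_backend_buft_alloc_buffer and ggml_backend_cpu_buffer_type are public entry points declared in ggml-backend.h; the wrapper itself is illustrative):

```c
#include <stdio.h>
#include "ggml-backend.h"

// Illustrative wrapper: in this release a failed allocation returns NULL instead of asserting.
static ggml_backend_buffer_t alloc_cpu_buffer_checked(size_t size) {
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
    if (buf == NULL) {
        fprintf(stderr, "failed to allocate a %zu-byte CPU buffer\n", size);
    }
    return buf;
}
```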
@@ -653,6 +662,9 @@ struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
     size_t work_size;
+
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
 };
 
 GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +703,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
     }
 
+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     return cpu_plan;
 }
 
@@ -721,9 +736,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
         cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
         cpu_ctx->work_size = cplan.work_size;
     }
-
     cplan.work_data = cpu_ctx->work_data;
 
+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     ggml_graph_compute(cgraph, &cplan);
     return true;
 }
@@ -758,12 +775,21 @@ static struct ggml_backend_i cpu_backend_i = {
 
 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+    if (ctx == NULL) {
+        return NULL;
+    }
 
-    ctx->n_threads = GGML_DEFAULT_N_THREADS;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;
 
     ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+    if (cpu_backend == NULL) {
+        free(ctx);
+        return NULL;
+    }
 
     *cpu_backend = (struct ggml_backend) {
         /* .interface = */ cpu_backend_i,
@@ -783,7 +809,16 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
 
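Note: the new ggml_backend_cpu_set_abort_callback lets a caller interrupt a running graph; an ggml_abort_callback returns true to request the abort. A minimal usage sketch, assuming the declarations shipped in this release's ggml.h/ggml-backend.h (the stop-flag plumbing is illustrative only):

```c
#include <stdbool.h>
#include "ggml.h"
#include "ggml-backend.h"

// Returning true from the callback asks the CPU backend to stop computing the graph.
static bool should_abort(void * data) {
    const volatile bool * stop = data;
    return *stop;
}

static bool compute_with_cancel(struct ggml_cgraph * graph, volatile bool * stop_flag) {
    ggml_backend_t cpu = ggml_backend_cpu_init();
    if (cpu == NULL) {
        return false; // ggml_backend_cpu_init() can now return NULL on allocation failure
    }
    ggml_backend_cpu_set_abort_callback(cpu, should_abort, (void *) stop_flag);
    bool ok = ggml_backend_graph_compute(cpu, graph); // false if aborted or failed
    ggml_backend_free(cpu);
    return ok;
}
```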
@@ -847,6 +882,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
     ctx->n_buffers = n_buffers;
     ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
 
+    GGML_ASSERT(ctx->buffers != NULL);
+
     size_t total_size = 0;
     for (size_t i = 0; i < n_buffers; i++) {
         ctx->buffers[i] = buffers[i];
@@ -868,6 +905,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
     }
 }
 
+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        dup->nb[i] = tensor->nb[i];
+    }
+    return dup;
+}
+
+static bool ggml_is_view_op(enum ggml_op op) {
+    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}
 
 // scheduler
 
@@ -876,7 +925,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
 #define GGML_MAX_SPLIT_INPUTS 16
 
 struct ggml_backend_sched_split {
-    ggml_tallocr_t tallocr;
+    int backend_id;
     int i_start;
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
@@ -891,15 +940,17 @@ struct ggml_backend_sched {
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
     ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
-    ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
 
     ggml_gallocr_t galloc;
 
     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
-    // hash values (arrays of [hash_set.size])
-    ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
+    // hash values
+    int * tensor_backend_id;
+    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+
+    int * node_backend_ids; // [n_nodes]
+    int n_nodes;
 
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
@@ -909,77 +960,46 @@ struct ggml_backend_sched {
 
     struct ggml_context * ctx;
 
+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
 #else
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
-
-    ggml_backend_sched_eval_callback callback_eval;
-    void * callback_eval_user_data;
+    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };
 
 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define node_allocr(node) sched->node_talloc[hash_id(node)]
+#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
+#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
 
-static bool ggml_is_view_op(enum ggml_op op) {
-    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
-}
-
-// returns the priority of the backend, lower is better
-static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
+// returns the priority of the backend, lower id is higher priority
+static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
     for (int i = 0; i < sched->n_backends; i++) {
         if (sched->backends[i] == backend) {
             return i;
         }
     }
-    return INT_MAX;
-}
-
-static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->tallocs[i] == allocr) {
-            return i;
-        }
-    }
-    return INT_MAX;
+    return -1;
 }
 
-static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
     if (buffer == NULL) {
-        return NULL;
-    }
-
-    // check if this is already allocate in a allocr buffer (from user manual allocations)
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
-            return sched->tallocs[i];
-        }
+        return -1;
     }
 
     // find highest prio backend that supports the buffer type
     for (int i = 0; i < sched->n_backends; i++) {
         if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
-            return sched->tallocs[i];
+            return i;
         }
     }
     GGML_ASSERT(false && "tensor buffer type not supported by any backend");
 }
 
-static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
-    if (allocr == NULL) {
-        return NULL;
-    }
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->tallocs[i] == allocr) {
-            return sched->backends[i];
-        }
-    }
-    GGML_UNREACHABLE();
-}
-
 #if 0
 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
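Note: the scheduler now identifies a backend by its index in the backends[] array, with a lower index meaning higher priority and -1 meaning "unassigned". A hedged construction sketch showing that ordering (GPU first, CPU last), assuming ggml_backend_sched_new as declared in this release; the helper is illustrative:

```c
// Backends are passed in priority order: index 0 is preferred, the CPU backend goes last (lowest priority).
static ggml_backend_sched_t make_sched(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) {
    ggml_backend_t backends[2] = { gpu_backend, cpu_backend };
    // NULL buffer types -> default buffer type of each backend; graph_size bounds the number of nodes.
    return ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);
}
```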
@@ -990,37 +1010,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I
 #endif
 
 // returns the backend that should be used for the node based on the current locations
-static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
+    // TODO: use supports_op to check if the backend supports the op
+
     // assign pre-allocated nodes to their backend
     // dst
-    ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
-    if (cur_allocr != NULL) {
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+    if (cur_backend != -1) {
         SET_CAUSE(node, "1.dst");
-        return cur_allocr;
+        return cur_backend;
     }
     // view_src
-    if (node->view_src != NULL) {
-        cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
-        if (cur_allocr != NULL) {
+    if (tensor->view_src != NULL) {
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+        if (cur_backend != -1) {
             SET_CAUSE(node, "1.vsrc");
-            return cur_allocr;
+            return cur_backend;
         }
     }
     // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
-        const struct ggml_tensor * src = node->src[i];
+        const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
            break;
        }
        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
+            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
             // operations with weights are always run on the same backend as the weights
             SET_CAUSE(node, "1.wgt%d", i);
-            return src_allocr;
+            return src_backend;
         }
     }
 
-    return NULL;
+    return -1;
 }
 
 static char * fmt_size(size_t size) {
@@ -1033,11 +1055,11 @@ static char * fmt_size(size_t size) {
     return buffer;
 }
 
-static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     int cur_split = 0;
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
-            ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
+            ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
             fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
@@ -1051,17 +1073,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-        ggml_tallocr_t node_allocr = node_allocr(node);
-        ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
+        ggml_backend_t tensor_backend = tensor_backend(node);
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
-            fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
+            fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
+            ggml_backend_t src_backend = tensor_backend(src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
@@ -1069,23 +1089,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
     }
 }
 
-// creates a copy of the tensor with the same memory layout
-static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
-    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        dup->nb[i] = tensor->nb[i];
-    }
-    return dup;
-}
-
-
 //#define DEBUG_PASS1
 //#define DEBUG_PASS2
 //#define DEBUG_PASS3
 //#define DEBUG_PASS4
 
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
     sched->is_reset = false;
@@ -1107,28 +1117,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-        if (node_allocr(leaf) != NULL) {
+        if (tensor_backend_id(leaf) != -1) {
             // do not overwrite user assignments
             continue;
         }
-        node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
+        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }
 
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        if (node_allocr(node) != NULL) {
+        if (tensor_backend_id(node) != -1) {
             // do not overwrite user assignments
             continue;
         }
-        node_allocr(node) = sched_allocr_from_cur(sched, node);
+        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
         // src
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
-            if (node_allocr(src) == NULL) {
-                node_allocr(src) = sched_allocr_from_cur(sched, src);
+            if (tensor_backend_id(src) == -1) {
+                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
             }
         }
     }
@@ -1143,22 +1153,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
     // pass 2.1 expand gpu up
     {
-        ggml_tallocr_t cur_allocr = NULL;
+        int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                if (tensor_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
-                    cur_allocr = NULL;
+                    cur_backend_id = -1;
                 } else {
-                    cur_allocr = node_allocr;
+                    cur_backend_id = tensor_backend_id;
                 }
             } else {
-                node_allocr(node) = cur_allocr;
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.1");
             }
         }
@@ -1166,22 +1176,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
     // pass 2.2 expand gpu down
     {
-        ggml_tallocr_t cur_allocr = NULL;
+        int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                if (tensor_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
-                    cur_allocr = NULL;
+                    cur_backend_id = -1;
                 } else {
-                    cur_allocr = node_allocr;
+                    cur_backend_id = tensor_backend_id;
                 }
             } else {
-                node_allocr(node) = cur_allocr;
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.2");
             }
         }
@@ -1189,17 +1199,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
     // pass 2.3 expand rest up
     {
-        ggml_tallocr_t cur_allocr = NULL;
+        int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                cur_backend_id = tensor_backend_id;
             } else {
-                node_allocr(node) = cur_allocr;
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.3");
             }
         }
@@ -1207,17 +1217,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
     // pass 2.4 expand rest down
     {
-        ggml_tallocr_t cur_allocr = NULL;
+        int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                cur_backend_id = tensor_backend_id;
             } else {
-                node_allocr(node) = cur_allocr;
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.4");
             }
         }
@@ -1229,9 +1239,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t cur_allocr = node_allocr(node);
-        if (node->view_src != NULL && cur_allocr == NULL) {
-            cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+        int cur_backend_id = tensor_backend_id(node);
+        if (node->view_src != NULL && cur_backend_id == -1) {
+            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
             SET_CAUSE(node, "3.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1239,14 +1249,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             if (src == NULL) {
                 break;
             }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            if (src_allocr == NULL) {
+            int src_backend_id = tensor_backend_id(src);
+            if (src_backend_id == -1) {
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
-                    node_allocr(src) = node_allocr(src->view_src);
+                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
                     SET_CAUSE(src, "3.vsrc");
                 } else {
-                    node_allocr(src) = cur_allocr;
+                    tensor_backend_id(src) = cur_backend_id;
                     SET_CAUSE(src, "3.cur");
                 }
             }
@@ -1263,15 +1273,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         if (!ggml_is_view_op(node->op)) {
-            sched->splits[0].tallocr = node_allocr(node);
+            sched->splits[0].backend_id = tensor_backend_id(node);
             break;
         }
     }
     sched->splits[0].i_start = 0;
     sched->splits[0].n_inputs = 0;
     memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-    ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
-    size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+    int cur_backend_id = sched->splits[0].backend_id;
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -1279,19 +1288,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             continue;
         }
 
-        ggml_tallocr_t node_allocr = node_allocr(node);
+        int tensor_backend_id = tensor_backend_id(node);
 
-        GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
+        GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
 
-        if (node_allocr != cur_allocr) {
+        if (tensor_backend_id != cur_backend_id) {
             sched->splits[cur_split].i_end = i;
             cur_split++;
             GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
-            sched->splits[cur_split].tallocr = node_allocr;
+            sched->splits[cur_split].backend_id = tensor_backend_id;
             sched->splits[cur_split].i_start = i;
             sched->splits[cur_split].n_inputs = 0;
-            cur_allocr = node_allocr;
-            cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+            cur_backend_id = tensor_backend_id;
         }
 
         // find inputs that are not on the same backend
@@ -1300,43 +1308,25 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             if (src == NULL) {
                 break;
             }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
-            if (src_allocr != node_allocr) {
+            int src_backend_id = tensor_backend_id(src);
+            assert(src_backend_id != -1); // all inputs should be assigned by now
+            if (src_backend_id != tensor_backend_id) {
                 // create a copy of the input in the split's backend
                 size_t id = hash_id(src);
-                if (sched->node_copies[id][cur_backend_id] == NULL) {
-                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+                    ggml_backend_t backend = sched->backends[cur_backend_id];
                     struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                     ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
 
-                    sched->node_copies[id][cur_backend_id] = tensor_copy;
-                    node_allocr(tensor_copy) = cur_allocr;
+                    sched->tensor_copies[id][cur_backend_id] = tensor_copy;
+                    tensor_backend_id(tensor_copy) = cur_backend_id;
                     SET_CAUSE(tensor_copy, "4.cpy");
 
                     int n_inputs = sched->splits[cur_split].n_inputs++;
                     GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                     sched->splits[cur_split].inputs[n_inputs] = src;
                 }
-                node->src[j] = sched->node_copies[id][cur_backend_id];
-
-#if 0
-                // check if the input is already in the split
-                bool found = false;
-                for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
-                    if (sched->splits[cur_split].inputs[k] == src) {
-                        found = true;
-                        break;
-                    }
-                }
-
-                if (!found) {
-                    int n_inputs = sched->splits[cur_split].n_inputs++;
-                    //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
-                    GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
-                    sched->splits[cur_split].inputs[n_inputs] = src;
-                }
-#endif
+                node->src[j] = sched->tensor_copies[id][cur_backend_id];
             }
         }
     }
@@ -1351,30 +1341,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t node_allocr = node_allocr(node);
-        if (node_allocr == NULL) {
+        ggml_backend_t tensor_backend = tensor_backend(node);
+        if (tensor_backend == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
-        if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+        if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
-                node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+                node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
+            ggml_backend_t src_backend = tensor_backend(src);
+            if (src_backend != tensor_backend /* && src_backend != NULL */) {
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
-                    j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
+                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
             }
-            if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+            if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
-                    src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
+                    src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
             }
         }
     }
@@ -1388,32 +1378,45 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
-        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+
             // add a dependency to the input source so that it is not freed before the copy is done
-            GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
-            input_cpy->src[0] = input;
+            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
+
+            // add a dependency to the input copy so that it is allocated at the start of the split
+            sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
 
         for (int j = split->i_start; j < split->i_end; j++) {
+            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
     }
     sched->graph = graph_copy;
 }
 
-static void sched_alloc_splits(ggml_backend_sched_t sched) {
-    ggml_gallocr_alloc_graph_n(
-        sched->galloc,
-        sched->graph,
-        sched->hash_set,
-        sched->node_talloc);
+static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+#ifndef NDEBUG
+        fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
+#endif
+        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+        if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+            fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
+            return false;
+        }
+    }
+
+    return true;
 }
 
-static void sched_compute_splits(ggml_backend_sched_t sched) {
+static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
     uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
 
@@ -1421,20 +1424,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &splits[i];
-        ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
-        int split_backend_id = sched_backend_prio(sched, split_backend);
+        int split_backend_id = split->backend_id;
+        ggml_backend_t split_backend = sched->backends[split_backend_id];
 
         // copy the input tensors to the split backend
         uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
 
             GGML_ASSERT(input->buffer != NULL);
             GGML_ASSERT(input_cpy->buffer != NULL);
 
-            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
-            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
             ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
         }
         //ggml_backend_synchronize(split_backend); // necessary to measure copy time
@@ -1450,7 +1451,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
         uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
-            ggml_backend_graph_compute(split_backend, &split->graph);
+            if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
+                return false;
+            }
             //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
@@ -1470,7 +1473,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
-                ggml_backend_graph_compute(split_backend, &gv);
+                if (!ggml_backend_graph_compute(split_backend, &gv)) {
+                    return false;
+                }
 
                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                     break;
@@ -1492,19 +1497,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         }
     }
 #endif
-}
-
-static void sched_reset(ggml_backend_sched_t sched) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        ggml_tallocr_reset(sched->tallocs[i]);
-    }
-    // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
-    memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
-    memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+    return true;
 }
 
 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1514,9 +1508,10 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
     // initialize hash table
-    sched->hash_set    = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
-    sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
-    sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
+    sched->hash_set          = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
+    sched->tensor_copies     = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+    sched->node_backend_ids  = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
 
     sched->n_backends = n_backends;
     for (int i = 0; i < n_backends; i++) {
@@ -1524,14 +1519,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
         sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
     }
 
-    sched->galloc = ggml_gallocr_new();
-
-    // init measure allocs for each backend
-    for (int i = 0; i < n_backends; i++) {
-        sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
-    }
+    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
 
-    sched_reset(sched);
+    ggml_backend_sched_reset(sched);
 
     return sched;
 }
@@ -1540,49 +1530,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
         return;
     }
-    for (int i = 0; i < sched->n_backends; i++) {
-        ggml_tallocr_free(sched->tallocs[i]);
-    }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->hash_set.keys);
-    free(sched->node_talloc);
-    free(sched->node_copies);
+    free(sched->tensor_backend_id);
+    free(sched->tensor_copies);
+    free(sched->node_backend_ids);
     free(sched);
 }
 
-void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
+    // reset state for the next run
+    size_t hash_size = sched->hash_set.size;
+    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+
+    sched->is_reset = true;
+}
 
-    sched_split_graph(sched, measure_graph);
-    sched_alloc_splits(sched);
+bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    ggml_backend_sched_split_graph(sched, measure_graph);
 
-    // allocate buffers and reset allocators
-    for (int i = 0; i < sched->n_backends; i++) {
-        size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
-        ggml_tallocr_free(sched->tallocs[i]);
-        sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
+    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
+        return false;
     }
 
-    sched_reset(sched);
+    ggml_backend_sched_reset(sched);
+    return true;
 }
 
-void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
 
     if (!sched->is_reset) {
-        sched_reset(sched);
+        ggml_backend_sched_reset(sched);
     }
 
-    sched_split_graph(sched, graph);
-    sched_alloc_splits(sched);
-    sched_compute_splits(sched);
-}
+    ggml_backend_sched_split_graph(sched, graph);
+    if (!ggml_backend_sched_alloc_splits(sched)) {
+        return false;
+    }
 
-void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
-    sched_reset(sched);
-}
+    if (!ggml_backend_sched_compute_splits(sched)) {
+        return false;
+    }
 
+    return true;
+}
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
     sched->callback_eval = callback;
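Note: ggml_backend_sched_reserve and ggml_backend_sched_graph_compute now report failure through a bool return instead of aborting, so callers can recover. A hedged sketch of checking both results (function names as declared in this release's ggml-backend.h; the wrapper is illustrative):

```c
// Reserve once with a worst-case graph, then compute the per-iteration graph, checking both results.
static bool run_scheduled(ggml_backend_sched_t sched,
                          struct ggml_cgraph * measure_graph,
                          struct ggml_cgraph * graph) {
    if (!ggml_backend_sched_reserve(sched, measure_graph)) {
        return false; // reserving the compute buffers failed
    }
    if (!ggml_backend_sched_graph_compute(sched, graph)) {
        return false; // split allocation failed, or a backend compute failed / was aborted
    }
    return true;
}
```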
@@ -1593,37 +1588,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
 
-ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    return sched->tallocs[backend_index];
-}
-
-ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
+size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
+    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
 void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    node_allocr(node) = sched->tallocs[backend_index];
+    tensor_backend_id(node) = backend_index;
 }
 
 ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
-    ggml_tallocr_t allocr = node_allocr(node);
-    if (allocr == NULL) {
+    int backend_index = tensor_backend_id(node);
+    if (backend_index == -1) {
         return NULL;
     }
-    return get_allocr_backend(sched, allocr);
+    return sched->backends[backend_index];
 }
 
 // utils
 
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
@@ -1647,7 +1635,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
     ggml_backend_buffer_init_tensor(buffer, tensor);
 }
 
-static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
+static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
     struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
 
     GGML_ASSERT(src != NULL);
@@ -1660,7 +1648,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
 
     struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
     if (src->view_src != NULL) {
-        dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+        dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
         dst->view_offs = src->view_offs;
     }
     dst->op = src->op;
@@ -1673,14 +1661,14 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
         if (s == NULL) {
             break;
         }
-        dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+        dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
     }
 
     node_copies[id] = dst;
     return dst;
 }
 
-static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
     size_t id = ggml_hash_find(hash_set, src);
     if (node_init[id]) {
         return;
@@ -1689,7 +1677,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
 
     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
-        graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
+        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
         ggml_backend_view_init(dst->view_src->buffer, dst);
     }
     else {
@@ -1702,17 +1690,17 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
         if (s == NULL) {
             break;
         }
-        graph_init_tensor(hash_set, node_copies, node_init, s);
+        graph_copy_init_tensor(hash_set, node_copies, node_init, s);
     }
 }
 
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = {
         /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
+        /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
     };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
-    bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
+    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
+    bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
 
     struct ggml_init_params params = {
         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -1741,7 +1729,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // dup nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+        graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
     }
 
     // allocate nodes
@@ -1766,7 +1754,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // copy data and init views
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        graph_init_tensor(hash_set, node_copies, node_init, node);
+        graph_copy_init_tensor(hash_set, node_copies, node_init, node);
    }
 
     // build graph copy