llama_cpp 0.12.5 → 0.12.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -1
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +347 -40
- data/vendor/tmp/llama.cpp/ggml-quants.h +14 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +14 -61
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +89 -6
- data/vendor/tmp/llama.cpp/ggml.c +134 -60
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +654 -130
- data/vendor/tmp/llama.cpp/llama.h +6 -0
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
The hunks below show the changes to data/vendor/tmp/llama.cpp/ggml-backend.c. Three themes run through them: the CPU backend gains an abort callback, allocation failures are now reported to the caller instead of asserting, and the graph scheduler drops its per-backend ggml_tallocr_t allocators in favor of plain integer backend ids backed by the reworked ggml_gallocr interface.

--- a/data/vendor/tmp/llama.cpp/ggml-backend.c
+++ b/data/vendor/tmp/llama.cpp/ggml-backend.c
@@ -475,6 +475,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
 
 // backend CPU
 
+static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
+
 GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -482,7 +484,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t
 }
 
 GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)buffer->context;
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
 }
 
 GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -540,8 +549,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .reset = */ NULL,
 };
 
-static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
-
 GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";
 
@@ -550,9 +557,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend
 
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
-
-    GGML_ASSERT(data != NULL && "failed to allocate buffer");
+    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
 
     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }
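The hunks above replace the old 64-byte `TENSOR_ALIGNMENT` with a 32-byte one (matching the alignment gguf guarantees for mmap) and make the CPU buffer actually honor it: `alloc_buffer` over-allocates by one alignment unit and `get_base` rounds the raw `malloc` pointer up. A minimal standalone sketch of that round-up idiom, with a local `pad_to` standing in for ggml's `GGML_PAD` macro:

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Round x up to the next multiple of n (n > 0). GGML_PAD does the same thing
// with bit masking for power-of-two n.
static uintptr_t pad_to(uintptr_t x, uintptr_t n) {
    return ((x + n - 1) / n) * n;
}

int main(void) {
    const size_t alignment = 32;
    const size_t size      = 1000;

    // over-allocate so that `size` aligned bytes always fit, as
    // ggml_backend_cpu_buffer_type_alloc_buffer does above
    void * raw = malloc(size + alignment);
    if (raw == NULL) {
        fprintf(stderr, "failed to allocate buffer of size %zu\n", size + alignment);
        return 1;
    }

    uintptr_t base = (uintptr_t) raw;
    if (base % alignment != 0) {
        base = pad_to(base, alignment);
    }
    printf("raw %p -> aligned base %p\n", raw, (void *) base);

    free(raw); // always free the original pointer, not the aligned one
    return 0;
}
```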
@@ -653,6 +662,9 @@ struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
     size_t work_size;
+
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
 };
 
 GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +703,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
     }
 
+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     return cpu_plan;
 }
 
@@ -721,9 +736,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
         cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
         cpu_ctx->work_size = cplan.work_size;
     }
-
     cplan.work_data = cpu_ctx->work_data;
 
+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     ggml_graph_compute(cgraph, &cplan);
     return true;
 }
@@ -758,12 +775,21 @@ static struct ggml_backend_i cpu_backend_i = {
 
 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+    if (ctx == NULL) {
+        return NULL;
+    }
 
-    ctx->n_threads = GGML_DEFAULT_N_THREADS;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;
 
     ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+    if (cpu_backend == NULL) {
+        free(ctx);
+        return NULL;
+    }
 
     *cpu_backend = (struct ggml_backend) {
         /* .interface = */ cpu_backend_i,
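`ggml_backend_cpu_init` above now checks both `malloc` results and frees the context if the backend allocation fails, so a failed init returns NULL instead of crashing later. The same unwind shape in miniature (the names below are illustrative, not from ggml):

```c
#include <stdlib.h>

struct inner { int x; };
struct outer { struct inner * in; };

// Allocate two dependent objects; on a partial failure, release what was
// already built and report NULL to the caller.
static struct outer * outer_new(void) {
    struct inner * in = malloc(sizeof(*in));
    if (in == NULL) {
        return NULL;
    }
    struct outer * out = malloc(sizeof(*out));
    if (out == NULL) {
        free(in); // do not leak the first allocation
        return NULL;
    }
    out->in = in;
    return out;
}
```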
@@ -783,7 +809,16 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
 
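The new `ggml_backend_cpu_set_abort_callback` stores a `ggml_abort_callback` (a `bool (*)(void * data)`; returning true asks `ggml_graph_compute` to stop early) that the CPU backend copies into each compute plan, as the earlier hunks show. A sketch of how a caller might wire up a deadline — the `deadline` struct and `compute_with_deadline` wrapper are hypothetical, only the two ggml calls come from this diff:

```c
#include <stdbool.h>
#include <time.h>

#include "ggml.h"         // vendored with this gem; struct ggml_cgraph
#include "ggml-backend.h" // CPU backend API shown above

struct deadline { time_t end; };

// ggml_abort_callback: return true to request that computation stop.
static bool abort_after_deadline(void * data) {
    struct deadline * d = (struct deadline *) data;
    return time(NULL) > d->end;
}

// Compute `graph` on the CPU backend, giving up after roughly five seconds.
static bool compute_with_deadline(ggml_backend_t cpu_backend, struct ggml_cgraph * graph) {
    struct deadline d = { .end = time(NULL) + 5 };
    ggml_backend_cpu_set_abort_callback(cpu_backend, abort_after_deadline, &d);

    // the callback is copied into the cplan on every compute call and is
    // polled between nodes by ggml_graph_compute
    return ggml_backend_graph_compute(cpu_backend, graph);
}
```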
@@ -847,6 +882,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
     ctx->n_buffers = n_buffers;
     ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
 
+    GGML_ASSERT(ctx->buffers != NULL);
+
     size_t total_size = 0;
     for (size_t i = 0; i < n_buffers; i++) {
         ctx->buffers[i] = buffers[i];
@@ -868,6 +905,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
     }
 }
 
+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        dup->nb[i] = tensor->nb[i];
+    }
+    return dup;
+}
+
+static bool ggml_is_view_op(enum ggml_op op) {
+    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}
 
 // scheduler
 
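`ggml_dup_tensor_layout` (moved up here from later in the file) exists because `ggml_dup_tensor` always produces a contiguous tensor; for a view such as a transpose, the `nb` strides must be copied over as well or the duplicate would have a different memory layout than the original. A small illustration against the vendored ggml API — it assumes a valid, already-initialized `ggml_context`:

```c
#include <stdio.h>

#include "ggml.h" // vendored with this gem

// A transposed view has permuted strides; a plain ggml_dup_tensor of it is
// contiguous again, which is exactly what ggml_dup_tensor_layout avoids.
static void show_layout_difference(struct ggml_context * ctx) {
    struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
    struct ggml_tensor * at = ggml_transpose(ctx, a);   // swaps nb[0] and nb[1]
    struct ggml_tensor * d  = ggml_dup_tensor(ctx, at); // contiguous strides

    printf("view nb = (%zu, %zu), dup nb = (%zu, %zu)\n",
           at->nb[0], at->nb[1], d->nb[0], d->nb[1]);
}
```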
@@ -876,7 +925,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
 #define GGML_MAX_SPLIT_INPUTS 16
 
 struct ggml_backend_sched_split {
-    ggml_tallocr_t tallocr;
+    int backend_id;
     int i_start;
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
@@ -891,15 +940,17 @@ struct ggml_backend_sched {
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
     ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
-    ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
 
     ggml_gallocr_t galloc;
 
     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
-    // hash values (arrays of [hash_set.size])
-    ggml_tallocr_t * node_talloc;
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS];
+    // hash values
+    int * tensor_backend_id;
+    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+
+    int * node_backend_ids; // [n_nodes]
+    int n_nodes;
 
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
@@ -909,77 +960,46 @@ struct ggml_backend_sched {
 
     struct ggml_context * ctx;
 
+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
 #else
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
-
-    ggml_backend_sched_eval_callback callback_eval;
-    void * callback_eval_user_data;
+    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };
 
 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define node_allocr(node) sched->node_talloc[hash_id(node)]
+#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
+#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
 
-static bool ggml_is_view_op(enum ggml_op op) {
-    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
-}
-
-// returns the priority of the backend, lower is better
-static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
+// returns the priority of the backend, lower id is higher priority
+static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
     for (int i = 0; i < sched->n_backends; i++) {
         if (sched->backends[i] == backend) {
             return i;
         }
     }
-    return INT_MAX;
-}
-
-static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->tallocs[i] == allocr) {
-            return i;
-        }
-    }
-    return INT_MAX;
+    return -1;
 }
 
-static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
     if (buffer == NULL) {
-        return NULL;
-    }
-
-    // check if this is already allocate in a allocr buffer (from user manual allocations)
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
-            return sched->tallocs[i];
-        }
+        return -1;
     }
 
     // find highest prio backend that supports the buffer type
     for (int i = 0; i < sched->n_backends; i++) {
         if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
-            return sched->tallocs[i];
+            return i;
         }
     }
     GGML_ASSERT(false && "tensor buffer type not supported by any backend");
 }
 
-static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
-    if (allocr == NULL) {
-        return NULL;
-    }
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->tallocs[i] == allocr) {
-            return sched->backends[i];
-        }
-    }
-    GGML_UNREACHABLE();
-}
-
 #if 0
 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
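The scheduler now keys everything off integer backend ids stored in a pointer-keyed hash table (the `tensor_backend_id(node)` macro above), with -1 meaning "unassigned" — which is why, further down, `ggml_backend_sched_reset` can clear the whole table with a single `memset` to -1. A toy version of the same pattern (not ggml's actual hash, which lives in ggml-impl.h; this sketch assumes the table never fills):

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TABLE_SIZE 64

struct toy_hash_set {
    const void * keys[TABLE_SIZE];
    int          backend_ids[TABLE_SIZE]; // -1 == not assigned yet
};

// Open addressing with linear probing over pointer keys.
static size_t toy_find_or_insert(struct toy_hash_set * set, const void * key) {
    size_t i = ((uintptr_t) key >> 4) % TABLE_SIZE;
    while (set->keys[i] != NULL && set->keys[i] != key) {
        i = (i + 1) % TABLE_SIZE;
    }
    set->keys[i] = key;
    return i;
}

int main(void) {
    struct toy_hash_set set;
    memset(set.keys, 0, sizeof(set.keys));
    memset(set.backend_ids, -1, sizeof(set.backend_ids)); // all bytes 0xFF == -1 per int

    int tensor_a, tensor_b; // stand-ins for struct ggml_tensor nodes

    set.backend_ids[toy_find_or_insert(&set, &tensor_a)] = 0; // assign backend 0
    printf("a -> %d, b -> %d\n",
           set.backend_ids[toy_find_or_insert(&set, &tensor_a)],
           set.backend_ids[toy_find_or_insert(&set, &tensor_b)]); // b is still -1
    return 0;
}
```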
@@ -990,37 +1010,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I
 #endif
 
 // returns the backend that should be used for the node based on the current locations
-static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
+    // TODO: use supports_op to check if the backend supports the op
+
     // assign pre-allocated nodes to their backend
     // dst
-    ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
-    if (cur_allocr != NULL) {
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+    if (cur_backend != -1) {
         SET_CAUSE(node, "1.dst");
-        return cur_allocr;
+        return cur_backend;
     }
     // view_src
-    if (node->view_src != NULL) {
-        cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
-        if (cur_allocr != NULL) {
+    if (tensor->view_src != NULL) {
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+        if (cur_backend != -1) {
             SET_CAUSE(node, "1.vsrc");
-            return cur_allocr;
+            return cur_backend;
         }
     }
     // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
-        const struct ggml_tensor * src = node->src[i];
+        const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
             break;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
+            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
             // operations with weights are always run on the same backend as the weights
             SET_CAUSE(node, "1.wgt%d", i);
-            return src_allocr;
+            return src_backend;
         }
     }
 
-    return NULL;
+    return -1;
 }
 
 static char * fmt_size(size_t size) {
@@ -1033,11 +1055,11 @@ static char * fmt_size(size_t size) {
     return buffer;
 }
 
-static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     int cur_split = 0;
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
-            ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
+            ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
             fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
@@ -1051,17 +1073,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-        ggml_tallocr_t node_allocr = node_allocr(node);
-        ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
+        ggml_backend_t tensor_backend = tensor_backend(node);
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
-            fmt_size(ggml_nbytes(node)), node_backend ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
+            fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
+            ggml_backend_t src_backend = tensor_backend(src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
@@ -1069,23 +1089,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
     }
 }
 
-// creates a copy of the tensor with the same memory layout
-static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
-    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        dup->nb[i] = tensor->nb[i];
-    }
-    return dup;
-}
-
-
 //#define DEBUG_PASS1
 //#define DEBUG_PASS2
 //#define DEBUG_PASS3
 //#define DEBUG_PASS4
 
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
     sched->is_reset = false;
@@ -1107,28 +1117,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-        if (node_allocr(leaf) != NULL) {
+        if (tensor_backend_id(leaf) != -1) {
             // do not overwrite user assignments
             continue;
         }
-        node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
+        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }
 
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        if (node_allocr(node) != NULL) {
+        if (tensor_backend_id(node) != -1) {
             // do not overwrite user assignments
             continue;
         }
-        node_allocr(node) = sched_allocr_from_cur(sched, node);
+        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
         // src
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
-            if (node_allocr(src) == NULL) {
-                node_allocr(src) = sched_allocr_from_cur(sched, src);
+            if (tensor_backend_id(src) == -1) {
+                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
             }
         }
     }
@@ -1143,22 +1153,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
     // pass 2.1 expand gpu up
     {
-        ggml_tallocr_t cur_allocr = NULL;
+        int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                if (tensor_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
-                    cur_allocr = NULL;
+                    cur_backend_id = -1;
                 } else {
-                    cur_allocr = node_allocr;
+                    cur_backend_id = tensor_backend_id;
                 }
             } else {
-                node_allocr(node) = cur_allocr;
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.1");
             }
         }
@@ -1166,22 +1176,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
     // pass 2.2 expand gpu down
     {
-        ggml_tallocr_t cur_allocr = NULL;
+        int cur_backend_id = -1;
        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                if (tensor_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
-                    cur_allocr = NULL;
+                    cur_backend_id = -1;
                 } else {
-                    cur_allocr = node_allocr;
+                    cur_backend_id = tensor_backend_id;
                 }
             } else {
-                node_allocr(node) = cur_allocr;
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.2");
             }
         }
@@ -1189,17 +1199,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
     // pass 2.3 expand rest up
     {
-        ggml_tallocr_t cur_allocr = NULL;
+        int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                cur_backend_id = tensor_backend_id;
             } else {
-                node_allocr(node) = cur_allocr;
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.3");
             }
         }
@@ -1207,17 +1217,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
     // pass 2.4 expand rest down
     {
-        ggml_tallocr_t cur_allocr = NULL;
+        int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                cur_backend_id = tensor_backend_id;
             } else {
-                node_allocr(node) = cur_allocr;
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.4");
             }
         }
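Passes 2.1-2.4 above are four sweeps of the same idea: walk the nodes in one direction and let unassigned nodes inherit the most recent explicit assignment (the "gpu" passes additionally refuse to propagate the CPU backend, which by convention is the last, lowest-priority entry). The core of one forward sweep, reduced to plain arrays:

```c
#include <stdio.h>

// One forward "expand" sweep over per-node backend ids (-1 = unassigned):
// unassigned nodes inherit the id most recently seen in the sweep direction.
static void expand_down(int * ids, int n) {
    int cur = -1;
    for (int i = 0; i < n; i++) {
        if (ids[i] != -1) {
            cur = ids[i]; // remember the last explicit assignment
        } else {
            ids[i] = cur; // inherit it (stays -1 before the first assignment)
        }
    }
}

int main(void) {
    int ids[] = { -1, 0, -1, -1, 1, -1 };
    expand_down(ids, 6);
    for (int i = 0; i < 6; i++) {
        printf("%d ", ids[i]); // prints: -1 0 0 0 1 1
    }
    printf("\n");
    return 0;
}
```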
@@ -1229,9 +1239,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t cur_allocr = node_allocr(node);
-        if (node->view_src != NULL && cur_allocr == NULL) {
-            cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+        int cur_backend_id = tensor_backend_id(node);
+        if (node->view_src != NULL && cur_backend_id == -1) {
+            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
             SET_CAUSE(node, "3.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1239,14 +1249,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             if (src == NULL) {
                 break;
             }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            if (src_allocr == NULL) {
+            int src_backend_id = tensor_backend_id(src);
+            if (src_backend_id == -1) {
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
-                    node_allocr(src) = node_allocr(src->view_src);
+                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
                     SET_CAUSE(src, "3.vsrc");
                 } else {
-                    node_allocr(src) = cur_allocr;
+                    tensor_backend_id(src) = cur_backend_id;
                     SET_CAUSE(src, "3.cur");
                 }
             }
@@ -1263,15 +1273,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         if (!ggml_is_view_op(node->op)) {
-            sched->splits[0].tallocr = node_allocr(node);
+            sched->splits[0].backend_id = tensor_backend_id(node);
             break;
         }
     }
     sched->splits[0].i_start = 0;
     sched->splits[0].n_inputs = 0;
     memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-    ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
-    size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+    int cur_backend_id = sched->splits[0].backend_id;
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -1279,19 +1288,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             continue;
         }
 
-        ggml_tallocr_t node_allocr = node_allocr(node);
+        int tensor_backend_id = tensor_backend_id(node);
 
-        GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
+        GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
 
-        if (node_allocr != cur_allocr) {
+        if (tensor_backend_id != cur_backend_id) {
             sched->splits[cur_split].i_end = i;
             cur_split++;
             GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
-            sched->splits[cur_split].tallocr = node_allocr;
+            sched->splits[cur_split].backend_id = tensor_backend_id;
             sched->splits[cur_split].i_start = i;
             sched->splits[cur_split].n_inputs = 0;
-            cur_allocr = node_allocr;
-            cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+            cur_backend_id = tensor_backend_id;
         }
 
         // find inputs that are not on the same backend
@@ -1300,43 +1308,25 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             if (src == NULL) {
                 break;
             }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
-            if (src_allocr != node_allocr) {
+            int src_backend_id = tensor_backend_id(src);
+            assert(src_backend_id != -1); // all inputs should be assigned by now
+            if (src_backend_id != tensor_backend_id) {
                 // create a copy of the input in the split's backend
                 size_t id = hash_id(src);
-                if (sched->node_copies[id][cur_backend_id] == NULL) {
-                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+                    ggml_backend_t backend = sched->backends[cur_backend_id];
                     struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                     ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
 
-                    sched->node_copies[id][cur_backend_id] = tensor_copy;
-                    node_allocr(tensor_copy) = cur_allocr;
+                    sched->tensor_copies[id][cur_backend_id] = tensor_copy;
+                    tensor_backend_id(tensor_copy) = cur_backend_id;
                     SET_CAUSE(tensor_copy, "4.cpy");
 
                     int n_inputs = sched->splits[cur_split].n_inputs++;
                     GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                     sched->splits[cur_split].inputs[n_inputs] = src;
                 }
-                node->src[j] = sched->node_copies[id][cur_backend_id];
-
-#if 0
-                // check if the input is already in the split
-                bool found = false;
-                for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
-                    if (sched->splits[cur_split].inputs[k] == src) {
-                        found = true;
-                        break;
-                    }
-                }
-
-                if (!found) {
-                    int n_inputs = sched->splits[cur_split].n_inputs++;
-                    //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
-                    GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
-                    sched->splits[cur_split].inputs[n_inputs] = src;
-                }
-#endif
+                node->src[j] = sched->tensor_copies[id][cur_backend_id];
             }
         }
     }
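Pass 4 above cuts the node list into contiguous splits wherever the backend id changes, recording `[i_start, i_end)` plus the inputs that must be copied across backends. The boundary-detection part, in isolation (assumes at least one node, and that `out` has room for every possible split):

```c
#include <stdio.h>

struct split { int backend_id, i_start, i_end; };

// Start a new split every time the per-node backend id changes, as
// ggml_backend_sched_split_graph does once all ids are assigned.
static int make_splits(const int * ids, int n, struct split * out) {
    int n_splits = 0;
    int cur = ids[0];
    out[0] = (struct split) { cur, 0, 0 };
    for (int i = 0; i < n; i++) {
        if (ids[i] != cur) {
            out[n_splits].i_end = i;
            cur = ids[i];
            n_splits++;
            out[n_splits] = (struct split) { cur, i, i };
        }
    }
    out[n_splits].i_end = n;
    return n_splits + 1;
}

int main(void) {
    int ids[] = { 0, 0, 1, 1, 1, 0 };
    struct split s[8];
    int n = make_splits(ids, 6, s);
    for (int i = 0; i < n; i++) {
        printf("split %d: backend %d, nodes [%d, %d)\n",
               i, s[i].backend_id, s[i].i_start, s[i].i_end);
    }
    return 0;
}
```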
@@ -1351,30 +1341,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t node_allocr = node_allocr(node);
-        if (node_allocr == NULL) {
+        ggml_backend_t tensor_backend = tensor_backend(node);
+        if (tensor_backend == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
-        if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+        if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
-                node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+                node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            if (src_allocr != node_allocr /* && src_backend != NULL */) {
+            ggml_backend_t src_backend = tensor_backend(src);
+            if (src_backend != tensor_backend /* && src_backend != NULL */) {
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
-                    j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
+                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
             }
-            if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+            if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
-                    src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
+                    src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
             }
         }
     }
@@ -1388,32 +1378,45 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
-        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+
             // add a dependency to the input source so that it is not freed before the copy is done
-            GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
-            input_cpy->src[0] = input;
+            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
+
+            // add a dependency to the input copy so that it is allocated at the start of the split
+            sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
 
         for (int j = split->i_start; j < split->i_end; j++) {
+            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
     }
     sched->graph = graph_copy;
 }
 
-static void sched_alloc_splits(ggml_backend_sched_t sched) {
-    ggml_gallocr_alloc_graph_n(
-        sched->galloc,
-        sched->graph,
-        sched->hash_set,
-        sched->node_talloc);
+static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+#ifndef NDEBUG
+        fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
+#endif
+        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+        if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+            fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
+            return false;
+        }
+    }
+
+    return true;
 }
 
-static void sched_compute_splits(ggml_backend_sched_t sched) {
+static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
     uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
 
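The new `ggml_backend_sched_alloc_splits` above is optimistic: it first tries `ggml_gallocr_alloc_graph` against the buffers reserved earlier, and only when that fails does it call `ggml_gallocr_reserve_n` to regrow them and try once more. The control flow in isolation, with stand-in functions for the two gallocr calls:

```c
#include <stdbool.h>
#include <stdio.h>

// Stand-ins for ggml_gallocr_alloc_graph and ggml_gallocr_reserve_n.
static bool try_alloc(int need, int * capacity) { return need <= *capacity; }
static void reserve(int need, int * capacity)   { *capacity = need; }

static bool alloc_with_retry(int need, int * capacity) {
    if (!try_alloc(need, capacity)) {
        fprintf(stderr, "failed to allocate graph, reserving\n");
        reserve(need, capacity);         // grow the backend buffers
        if (!try_alloc(need, capacity)) {
            return false;                // still too small: give up
        }
    }
    return true;
}

int main(void) {
    int capacity = 16;
    printf("ok = %d, capacity = %d\n", alloc_with_retry(64, &capacity), capacity);
    return 0;
}
```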
@@ -1421,20 +1424,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &splits[i];
-        ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
-        int split_backend_id = sched_backend_prio(sched, split_backend);
+        int split_backend_id = split->backend_id;
+        ggml_backend_t split_backend = sched->backends[split_backend_id];
 
         // copy the input tensors to the split backend
         uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
 
             GGML_ASSERT(input->buffer != NULL);
             GGML_ASSERT(input_cpy->buffer != NULL);
 
-            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
-            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
             ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
         }
         //ggml_backend_synchronize(split_backend); // necessary to measure copy time
@@ -1450,7 +1451,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
         uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
-            ggml_backend_graph_compute(split_backend, &split->graph);
+            if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
+                return false;
+            }
             //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
@@ -1470,7 +1473,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
-                ggml_backend_graph_compute(split_backend, &gv);
+                if (!ggml_backend_graph_compute(split_backend, &gv)) {
+                    return false;
+                }
 
                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                     break;
@@ -1492,19 +1497,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         }
     }
 #endif
-}
-
-static void sched_reset(ggml_backend_sched_t sched) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        ggml_tallocr_reset(sched->tallocs[i]);
-    }
-    // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
-    memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
-    memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+    return true;
 }
 
 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1514,9 +1508,10 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
-    sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
-    sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
+    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
 
     sched->n_backends = n_backends;
     for (int i = 0; i < n_backends; i++) {
@@ -1524,14 +1519,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
         sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
     }
 
-    sched->galloc = ggml_gallocr_new();
-
-    // init measure allocs for each backend
-    for (int i = 0; i < n_backends; i++) {
-        sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
-    }
+    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
 
-    sched_reset(sched);
+    ggml_backend_sched_reset(sched);
 
     return sched;
 }
@@ -1540,49 +1530,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
         return;
     }
-    for (int i = 0; i < sched->n_backends; i++) {
-        ggml_tallocr_free(sched->tallocs[i]);
-    }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->hash_set.keys);
-    free(sched->node_talloc);
-    free(sched->node_copies);
+    free(sched->tensor_backend_id);
+    free(sched->tensor_copies);
+    free(sched->node_backend_ids);
     free(sched);
 }
 
-void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
+    // reset state for the next run
+    size_t hash_size = sched->hash_set.size;
+    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+
+    sched->is_reset = true;
+}
 
-    sched_split_graph(sched, measure_graph);
-    sched_alloc_splits(sched);
+bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    ggml_backend_sched_split_graph(sched, measure_graph);
 
-    // allocate buffers and reset allocators
-    for (int i = 0; i < sched->n_backends; i++) {
-        size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
-        ggml_tallocr_free(sched->tallocs[i]);
-        sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
+    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
+        return false;
     }
 
-    sched_reset(sched);
+    ggml_backend_sched_reset(sched);
+    return true;
 }
 
-void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
 
     if (!sched->is_reset) {
-        sched_reset(sched);
+        ggml_backend_sched_reset(sched);
     }
 
-    sched_split_graph(sched, graph);
-    sched_alloc_splits(sched);
-    sched_compute_splits(sched);
-}
+    ggml_backend_sched_split_graph(sched, graph);
+    if (!ggml_backend_sched_alloc_splits(sched)) {
+        return false;
+    }
 
-void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
-    sched_reset(sched);
-}
+    if (!ggml_backend_sched_compute_splits(sched)) {
+        return false;
+    }
+
+    return true;
+}
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
     sched->callback_eval = callback;
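With this hunk the scheduler's public entry points report failure instead of aborting: `ggml_backend_sched_reserve` replaces the old measure-based initialization, and `ggml_backend_sched_graph_compute` now returns `bool`. A sketch of a caller updated for the new contract — the `run` wrapper is illustrative, only the two scheduler calls come from this diff:

```c
#include <stdio.h>

#include "ggml.h"
#include "ggml-backend.h"

// Reserve once from a worst-case measure graph, then check every compute call.
static int run(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph) {
    if (!ggml_backend_sched_reserve(sched, measure_graph)) {
        fprintf(stderr, "failed to reserve scheduler buffers\n");
        return 1;
    }

    // allocation happens lazily inside and can now fail cleanly
    if (!ggml_backend_sched_graph_compute(sched, graph)) {
        fprintf(stderr, "graph compute failed\n");
        return 1;
    }
    return 0;
}
```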
@@ -1593,37 +1588,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
 
-ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    return sched->tallocs[backend_index];
-}
-
-ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
+size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
+    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
 void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    node_allocr(node) = sched->tallocs[backend_index];
+    tensor_backend_id(node) = backend_index;
 }
 
 ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
-    ggml_tallocr_t allocr = node_allocr(node);
-    if (allocr == NULL) {
+    int backend_index = tensor_backend_id(node);
+    if (backend_index == -1) {
         return NULL;
     }
-    return get_allocr_backend(sched, allocr);
+    return sched->backends[backend_index];
 }
 
 // utils
 
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
@@ -1647,7 +1635,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
     ggml_backend_buffer_init_tensor(buffer, tensor);
 }
 
-static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
+static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
     struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
 
     GGML_ASSERT(src != NULL);
@@ -1660,7 +1648,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
 
     struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
     if (src->view_src != NULL) {
-        dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+        dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
         dst->view_offs = src->view_offs;
     }
     dst->op = src->op;
@@ -1673,14 +1661,14 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
         if (s == NULL) {
             break;
         }
-        dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+        dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
     }
 
     node_copies[id] = dst;
     return dst;
 }
 
-static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
     size_t id = ggml_hash_find(hash_set, src);
     if (node_init[id]) {
         return;
@@ -1689,7 +1677,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
 
     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
-        graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
+        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
         ggml_backend_view_init(dst->view_src->buffer, dst);
     }
     else {
@@ -1702,17 +1690,17 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
         if (s == NULL) {
             break;
         }
-        graph_init_tensor(hash_set, node_copies, node_init, s);
+        graph_copy_init_tensor(hash_set, node_copies, node_init, s);
     }
 }
 
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = {
         /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
+        /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
     };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
-    bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
+    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
+    bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
 
     struct ggml_init_params params = {
         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -1741,7 +1729,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // dup nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+        graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
     }
 
     // allocate nodes
@@ -1766,7 +1754,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // copy data and init views
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        graph_init_tensor(hash_set, node_copies, node_init, node);
+        graph_copy_init_tensor(hash_set, node_copies, node_init, node);
    }
 
     // build graph copy
|