llama_cpp 0.12.5 → 0.12.7
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +67 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +51 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +595 -492
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +268 -271
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +101 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +1255 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +39 -16
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +95 -264
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +213 -58
- data/vendor/tmp/llama.cpp/ggml.c +1082 -564
- data/vendor/tmp/llama.cpp/ggml.h +50 -17
- data/vendor/tmp/llama.cpp/llama.cpp +1329 -280
- data/vendor/tmp/llama.cpp/llama.h +43 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
The detailed hunks below are from data/vendor/tmp/llama.cpp/ggml-backend.c.

@@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

+    if (!size) {
+        return;
+    }
+
     tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
 }

@@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
     GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

+    if (!size) {
+        return;
+    }
+
     tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
 }

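
The two hunks above make zero-size tensor reads and writes an explicit no-op. A minimal sketch of the resulting behavior, assuming `t` is any tensor whose buffer is already set (the variable names here are illustrative, not from the gem):

    // Sketch only: with the early return above, a size-0 call never reaches
    // the backend's set_tensor/get_tensor implementation.
    float scratch = 0.0f;
    ggml_backend_tensor_set(t, &scratch, /*offset=*/0, /*size=*/0); // returns immediately
    ggml_backend_tensor_get(t, &scratch, /*offset=*/0, /*size=*/0); // returns immediately
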
@@ -475,6 +483,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {

 // backend CPU

+static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
+
 GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
     return "CPU";

@@ -482,7 +492,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t
 }

 GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)buffer->context;
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
 }

 GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
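
For reference, GGML_PAD rounds an address up to the next multiple of the alignment, so the new get_base implementation always returns a 32-byte aligned pointer even when the underlying allocation is not aligned. A small stand-alone sketch; the GGML_PAD definition below is copied from ggml.h as we read it, and the sample address is made up:

    #include <stdint.h>
    #include <stdio.h>

    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) // round x up to a multiple of n

    int main(void) {
        uintptr_t base    = 0x1001;               // unaligned address
        uintptr_t aligned = GGML_PAD(base, 32);   // 0x1020, the next 32-byte boundary
        printf("0x%lx -> 0x%lx\n", (unsigned long) base, (unsigned long) aligned);
        return 0;
    }
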
@@ -540,8 +557,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .reset = */ NULL,
 };

-static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
-
 GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";

@@ -550,9 +565,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend

 GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO:
-
-
+    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }

     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }
@@ -653,6 +670,9 @@ struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
     size_t work_size;
+
+    ggml_abort_callback abort_callback;
+    void * abort_callback_data;
 };

 GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +711,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
     }

+    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     return cpu_plan;
 }

@@ -721,9 +744,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
         cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
         cpu_ctx->work_size = cplan.work_size;
     }
-
     cplan.work_data = cpu_ctx->work_data;

+    cplan.abort_callback = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     ggml_graph_compute(cgraph, &cplan);
     return true;
 }
@@ -731,7 +756,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -758,12 +783,21 @@ static struct ggml_backend_i cpu_backend_i = {

 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+    if (ctx == NULL) {
+        return NULL;
+    }

-    ctx->n_threads
-    ctx->work_data
-    ctx->work_size
+    ctx->n_threads = GGML_DEFAULT_N_THREADS;
+    ctx->work_data = NULL;
+    ctx->work_size = 0;
+    ctx->abort_callback = NULL;
+    ctx->abort_callback_data = NULL;

     ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+    if (cpu_backend == NULL) {
+        free(ctx);
+        return NULL;
+    }

     *cpu_backend = (struct ggml_backend) {
         /* .interface = */ cpu_backend_i,
@@ -783,7 +817,16 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }

+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }

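
The new ggml_backend_cpu_set_abort_callback API lets a caller interrupt a long-running CPU graph computation. A hedged usage sketch follows; the deadline logic and the run_with_deadline helper are illustrative only and not part of the gem:

    // Returning true from the callback asks the CPU backend to stop computing.
    static bool abort_after_deadline(void * data) {
        const int64_t * deadline_us = (const int64_t *) data;
        return ggml_time_us() > *deadline_us;
    }

    static void run_with_deadline(struct ggml_cgraph * graph, int64_t budget_us) {
        ggml_backend_t cpu = ggml_backend_cpu_init();
        if (cpu == NULL) {
            return;                                  // init can now fail and return NULL
        }
        int64_t deadline_us = ggml_time_us() + budget_us;
        ggml_backend_cpu_set_abort_callback(cpu, abort_after_deadline, &deadline_us);
        ggml_backend_graph_compute(cpu, graph);      // stops early if the callback fires
        ggml_backend_free(cpu);
    }
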
@@ -847,6 +890,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
     ctx->n_buffers = n_buffers;
     ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));

+    GGML_ASSERT(ctx->buffers != NULL);
+
     size_t total_size = 0;
     for (size_t i = 0; i < n_buffers; i++) {
         ctx->buffers[i] = buffers[i];
@@ -868,6 +913,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
     }
 }

+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        dup->nb[i] = tensor->nb[i];
+    }
+    return dup;
+}
+
+static bool ggml_is_view_op(enum ggml_op op) {
+    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}

 // scheduler

@@ -876,7 +933,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
 #define GGML_MAX_SPLIT_INPUTS 16

 struct ggml_backend_sched_split {
-    ggml_tallocr_t tallocr;
+    int backend_id;
     int i_start;
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
@@ -891,15 +948,17 @@ struct ggml_backend_sched {
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
     ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
-    ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];

     ggml_gallocr_t galloc;

     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
-    // hash values
-    ggml_tallocr_t * node_talloc;
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS];
+    // hash values
+    int * tensor_backend_id;
+    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+
+    int * node_backend_ids; // [n_nodes]
+    int n_nodes;

     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
@@ -909,75 +968,45 @@ struct ggml_backend_sched {

     struct ggml_context * ctx;

+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
 #else
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
-
-    ggml_backend_sched_eval_callback callback_eval;
-    void * callback_eval_user_data;
+    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };

 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define node_allocr(node) sched->node_talloc[hash_id(node)]
+#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
+#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])

-static bool ggml_is_view_op(enum ggml_op op) {
-    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
-}
-
-// returns the priority of the backend, lower is better
-static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
+// returns the priority of the backend, lower id is higher priority
+static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
     for (int i = 0; i < sched->n_backends; i++) {
         if (sched->backends[i] == backend) {
             return i;
         }
     }
-    return INT_MAX;
-}
-
-static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->tallocs[i] == allocr) {
-            return i;
-        }
-    }
-    return INT_MAX;
+    return -1;
 }

-static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
     if (buffer == NULL) {
-        return NULL;
-    }
-
-    // check if this is already allocate in a allocr buffer (from user manual allocations)
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
-            return sched->tallocs[i];
-        }
+        return -1;
     }

     // find highest prio backend that supports the buffer type
     for (int i = 0; i < sched->n_backends; i++) {
         if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
-            return sched->tallocs[i];
+            return i;
         }
     }
     GGML_ASSERT(false && "tensor buffer type not supported by any backend");
-}
-
-static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
-    if (allocr == NULL) {
-        return NULL;
-    }
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->tallocs[i] == allocr) {
-            return sched->backends[i];
-        }
-    }
-    GGML_UNREACHABLE();
+    return -1; // silence warning
 }

 #if 0
@@ -990,37 +1019,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I
 #endif

 // returns the backend that should be used for the node based on the current locations
-static
+static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
+    // TODO: use supports_op to check if the backend supports the op
+
     // assign pre-allocated nodes to their backend
     // dst
-
-    if (
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+    if (cur_backend != -1) {
         SET_CAUSE(node, "1.dst");
-        return
+        return cur_backend;
     }
     // view_src
-    if (
-
-    if (
+    if (tensor->view_src != NULL) {
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+        if (cur_backend != -1) {
            SET_CAUSE(node, "1.vsrc");
-            return
+            return cur_backend;
         }
     }
     // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
-        const struct ggml_tensor * src =
+        const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
-
+            continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-
+            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
             // operations with weights are always run on the same backend as the weights
             SET_CAUSE(node, "1.wgt%d", i);
-            return
+            return src_backend;
         }
     }

-    return
+    return -1;
 }

 static char * fmt_size(size_t size) {
@@ -1033,11 +1064,11 @@ static char * fmt_size(size_t size) {
     return buffer;
 }

-static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     int cur_split = 0;
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
-            ggml_backend_t split_backend =
+            ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
             fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
@@ -1051,17 +1082,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-
-        ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
+        ggml_backend_t tensor_backend = tensor_backend(node);
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
-            fmt_size(ggml_nbytes(node)),
+            fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-
+                continue;
             }
-
-            ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
+            ggml_backend_t src_backend = tensor_backend(src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
@@ -1069,23 +1098,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
         }
     }

-// creates a copy of the tensor with the same memory layout
-static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
-    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        dup->nb[i] = tensor->nb[i];
-    }
-    return dup;
-}
-
-
 //#define DEBUG_PASS1
 //#define DEBUG_PASS2
 //#define DEBUG_PASS3
 //#define DEBUG_PASS4

 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
     sched->is_reset = false;
@@ -1107,28 +1126,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-        if (
+        if (tensor_backend_id(leaf) != -1) {
             // do not overwrite user assignments
             continue;
         }
-
+        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }

     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        if (
+        if (tensor_backend_id(node) != -1) {
             // do not overwrite user assignments
             continue;
         }
-
+        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
         // src
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-
+                continue;
             }
-            if (
-
+            if (tensor_backend_id(src) == -1) {
+                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
             }
         }
     }
@@ -1143,22 +1162,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g

     // pass 2.1 expand gpu up
     {
-
+        int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-
-            if (
-                if (
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                if (tensor_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
-
+                    cur_backend_id = -1;
                 } else {
-
+                    cur_backend_id = tensor_backend_id;
                 }
             } else {
-
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.1");
             }
         }
@@ -1166,22 +1185,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g

     // pass 2.2 expand gpu down
     {
-
+        int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-
-            if (
-                if (
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                if (tensor_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
-
+                    cur_backend_id = -1;
                 } else {
-
+                    cur_backend_id = tensor_backend_id;
                 }
             } else {
-
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.2");
             }
         }
@@ -1189,17 +1208,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g

     // pass 2.3 expand rest up
     {
-
+        int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-
-            if (
-
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                cur_backend_id = tensor_backend_id;
             } else {
-
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.3");
             }
         }
@@ -1207,17 +1226,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g

     // pass 2.4 expand rest down
     {
-
+        int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-
-            if (
-
+            int tensor_backend_id = tensor_backend_id(node);
+            if (tensor_backend_id != -1) {
+                cur_backend_id = tensor_backend_id;
             } else {
-
+                tensor_backend_id(node) = cur_backend_id;
                 SET_CAUSE(node, "2.4");
             }
         }
@@ -1229,24 +1248,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-
-        if (node->view_src != NULL &&
-
+        int cur_backend_id = tensor_backend_id(node);
+        if (node->view_src != NULL && cur_backend_id == -1) {
+            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
             SET_CAUSE(node, "3.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-
+                continue;
             }
-
-            if (
+            int src_backend_id = tensor_backend_id(src);
+            if (src_backend_id == -1) {
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
-
+                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
                     SET_CAUSE(src, "3.vsrc");
                 } else {
-
+                    tensor_backend_id(src) = cur_backend_id;
                     SET_CAUSE(src, "3.cur");
                 }
             }
@@ -1263,15 +1282,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         if (!ggml_is_view_op(node->op)) {
-            sched->splits[0].
+            sched->splits[0].backend_id = tensor_backend_id(node);
             break;
         }
     }
     sched->splits[0].i_start = 0;
     sched->splits[0].n_inputs = 0;
     memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-
-    size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+    int cur_backend_id = sched->splits[0].backend_id;
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];

@@ -1279,64 +1297,45 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             continue;
         }

-
+        int tensor_backend_id = tensor_backend_id(node);

-        GGML_ASSERT(
+        GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now

-        if (
+        if (tensor_backend_id != cur_backend_id) {
             sched->splits[cur_split].i_end = i;
             cur_split++;
             GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
-            sched->splits[cur_split].
+            sched->splits[cur_split].backend_id = tensor_backend_id;
             sched->splits[cur_split].i_start = i;
             sched->splits[cur_split].n_inputs = 0;
-
-            cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+            cur_backend_id = tensor_backend_id;
         }

         // find inputs that are not on the same backend
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-
+                continue;
             }
-
-
-            if (
+            int src_backend_id = tensor_backend_id(src);
+            assert(src_backend_id != -1); // all inputs should be assigned by now
+            if (src_backend_id != tensor_backend_id) {
                 // create a copy of the input in the split's backend
                 size_t id = hash_id(src);
-                if (sched->
-                    ggml_backend_t backend =
+                if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+                    ggml_backend_t backend = sched->backends[cur_backend_id];
                     struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                     ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);

-                    sched->
-
+                    sched->tensor_copies[id][cur_backend_id] = tensor_copy;
+                    tensor_backend_id(tensor_copy) = cur_backend_id;
                     SET_CAUSE(tensor_copy, "4.cpy");

                     int n_inputs = sched->splits[cur_split].n_inputs++;
                     GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                     sched->splits[cur_split].inputs[n_inputs] = src;
                 }
-                node->src[j] = sched->
-
-#if 0
-                // check if the input is already in the split
-                bool found = false;
-                for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
-                    if (sched->splits[cur_split].inputs[k] == src) {
-                        found = true;
-                        break;
-                    }
-                }
-
-                if (!found) {
-                    int n_inputs = sched->splits[cur_split].n_inputs++;
-                    //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
-                    GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
-                    sched->splits[cur_split].inputs[n_inputs] = src;
-                }
-#endif
+                node->src[j] = sched->tensor_copies[id][cur_backend_id];
             }
         }
     }
@@ -1351,30 +1350,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-
-        if (
+        ggml_backend_t tensor_backend = tensor_backend(node);
+        if (tensor_backend == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
-        if (node->view_src != NULL &&
+        if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name,
-                node->view_src->name,
+                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+                node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-
+                continue;
             }
-
-            if (
+            ggml_backend_t src_backend = tensor_backend(src);
+            if (src_backend != tensor_backend /* && src_backend != NULL */) {
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name,
-                    j, src->name,
+                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
             }
-            if (src->view_src != NULL &&
+            if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name,
-                    src->view_src->name,
+                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
+                    src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
             }
         }
     }
@@ -1388,32 +1387,45 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

-        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+
             // add a dependency to the input source so that it is not freed before the copy is done
-
-
+            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
+
+            // add a dependency to the input copy so that it is allocated at the start of the split
+            sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }

         for (int j = split->i_start; j < split->i_end; j++) {
+            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
     }
     sched->graph = graph_copy;
 }

-static
-
-
-
-
-
+static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+#ifndef NDEBUG
+        fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
+#endif
+        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+        if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+            fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
+            return false;
+        }
+    }
+
+    return true;
 }

-static
+static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
     uint64_t compute_us[GGML_MAX_BACKENDS] = {0};

@@ -1421,20 +1433,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {

     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &splits[i];
-
-
+        int split_backend_id = split->backend_id;
+        ggml_backend_t split_backend = sched->backends[split_backend_id];

         // copy the input tensors to the split backend
         uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];

             GGML_ASSERT(input->buffer != NULL);
             GGML_ASSERT(input_cpy->buffer != NULL);

-            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
-            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
             ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
         }
         //ggml_backend_synchronize(split_backend); // necessary to measure copy time
@@ -1450,7 +1460,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {

         uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
-            ggml_backend_graph_compute(split_backend, &split->graph);
+            if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
+                return false;
+            }
             //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
@@ -1470,7 +1482,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {

                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

-                ggml_backend_graph_compute(split_backend, &gv);
+                if (!ggml_backend_graph_compute(split_backend, &gv)) {
+                    return false;
+                }

                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                     break;
@@ -1492,19 +1506,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         }
     }
 #endif
-    }

-
-    for (int i = 0; i < sched->n_backends; i++) {
-        ggml_tallocr_reset(sched->tallocs[i]);
-    }
-    // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
-    memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
-    memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
-
-    sched->is_reset = true;
+    return true;
 }

 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
@@ -1514,9 +1517,10 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

     // initialize hash table
-    sched->hash_set
-    sched->
-    sched->
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
+    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);

     sched->n_backends = n_backends;
     for (int i = 0; i < n_backends; i++) {
@@ -1524,14 +1528,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
         sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
     }

-    sched->galloc =
-
-    // init measure allocs for each backend
-    for (int i = 0; i < n_backends; i++) {
-        sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
-    }
+    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);

-
+    ggml_backend_sched_reset(sched);

     return sched;
 }
@@ -1540,49 +1539,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
         return;
     }
-    for (int i = 0; i < sched->n_backends; i++) {
-        ggml_tallocr_free(sched->tallocs[i]);
-    }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->hash_set.keys);
-    free(sched->
-    free(sched->
+    free(sched->tensor_backend_id);
+    free(sched->tensor_copies);
+    free(sched->node_backend_ids);
     free(sched);
 }

-void
-
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
+    // reset state for the next run
+    size_t hash_size = sched->hash_set.size;
+    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

-
-
+    sched->is_reset = true;
+}

-
-
-
-
-
+bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
+    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
+        return false;
     }

-
+    ggml_backend_sched_reset(sched);
+    return true;
 }

-
+bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);

     if (!sched->is_reset) {
-
+        ggml_backend_sched_reset(sched);
     }

-
-
-
-    }
+    ggml_backend_sched_split_graph(sched, graph);
+    if (!ggml_backend_sched_alloc_splits(sched)) {
+        return false;
+    }

-
-
-    }
+    if (!ggml_backend_sched_compute_splits(sched)) {
+        return false;
+    }

+    return true;
+}

 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
     sched->callback_eval = callback;
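
Taken together, the scheduler now exposes an explicit reset/reserve/compute cycle, and graph computation reports failure through a bool return value instead of asserting. A hedged sketch of the intended call pattern; the backend array, measure_graph and graph are assumed to be set up by the caller, and the run_scheduled helper is illustrative only:

    static bool run_scheduled(ggml_backend_t * backends, int n_backends,
                              struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph) {
        // NULL bufts: use each backend's default buffer type
        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, n_backends, GGML_DEFAULT_GRAPH_SIZE);

        // pre-allocate backend buffers for a worst-case graph
        if (!ggml_backend_sched_reserve(sched, measure_graph)) {
            ggml_backend_sched_free(sched);
            return false;
        }

        // split, allocate and compute; failures now surface here instead of asserting
        bool ok = ggml_backend_sched_graph_compute(sched, graph);

        ggml_backend_sched_reset(sched);   // optional: clear per-graph assignments before another graph
        ggml_backend_sched_free(sched);
        return ok;
    }
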
@@ -1593,37 +1597,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }

-
-    int backend_index =
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    return sched->tallocs[backend_index];
-}
-
-ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
+size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    return
+    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }

 void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
-    int backend_index =
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-
+    tensor_backend_id(node) = backend_index;
 }

 ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
-
-    if (
+    int backend_index = tensor_backend_id(node);
+    if (backend_index == -1) {
         return NULL;
     }
-    return
+    return sched->backends[backend_index];
 }

 // utils

 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
@@ -1647,7 +1644,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
     ggml_backend_buffer_init_tensor(buffer, tensor);
 }

-static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
+static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
         struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {

     GGML_ASSERT(src != NULL);
@@ -1660,7 +1657,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru

     struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
     if (src->view_src != NULL) {
-        dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+        dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
         dst->view_offs = src->view_offs;
     }
     dst->op = src->op;
|
|
1671
1668
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
1672
1669
|
struct ggml_tensor * s = src->src[i];
|
1673
1670
|
if (s == NULL) {
|
1674
|
-
|
1671
|
+
continue;
|
1675
1672
|
}
|
1676
|
-
dst->src[i] =
|
1673
|
+
dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
|
1677
1674
|
}
|
1678
1675
|
|
1679
1676
|
node_copies[id] = dst;
|
1680
1677
|
return dst;
|
1681
1678
|
}
|
1682
1679
|
|
1683
|
-
static void
|
1680
|
+
static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
|
1684
1681
|
size_t id = ggml_hash_find(hash_set, src);
|
1685
1682
|
if (node_init[id]) {
|
1686
1683
|
return;
|
@@ -1689,7 +1686,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
|
|
1689
1686
|
|
1690
1687
|
struct ggml_tensor * dst = node_copies[id];
|
1691
1688
|
if (dst->view_src != NULL) {
|
1692
|
-
|
1689
|
+
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
1693
1690
|
ggml_backend_view_init(dst->view_src->buffer, dst);
|
1694
1691
|
}
|
1695
1692
|
else {
|
@@ -1700,19 +1697,19 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
|
|
1700
1697
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
1701
1698
|
struct ggml_tensor * s = src->src[i];
|
1702
1699
|
if (s == NULL) {
|
1703
|
-
|
1700
|
+
continue;
|
1704
1701
|
}
|
1705
|
-
|
1702
|
+
graph_copy_init_tensor(hash_set, node_copies, node_init, s);
|
1706
1703
|
}
|
1707
1704
|
}
|
1708
1705
|
|
1709
1706
|
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
1710
1707
|
struct ggml_hash_set hash_set = {
|
1711
1708
|
/* .size = */ graph->visited_hash_table.size,
|
1712
|
-
/* .keys = */ calloc(sizeof(hash_set.keys[0])
|
1709
|
+
/* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
|
1713
1710
|
};
|
1714
|
-
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0])
|
1715
|
-
bool * node_init = calloc(sizeof(node_init[0])
|
1711
|
+
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
|
1712
|
+
bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
|
1716
1713
|
|
1717
1714
|
struct ggml_init_params params = {
|
1718
1715
|
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
@@ -1741,7 +1738,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
|
1741
1738
|
// dup nodes
|
1742
1739
|
for (int i = 0; i < graph->n_nodes; i++) {
|
1743
1740
|
struct ggml_tensor * node = graph->nodes[i];
|
1744
|
-
|
1741
|
+
graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
|
1745
1742
|
}
|
1746
1743
|
|
1747
1744
|
// allocate nodes
|
@@ -1766,7 +1763,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // copy data and init views
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-
+        graph_copy_init_tensor(hash_set, node_copies, node_init, node);
     }

     // build graph copy