llama_cpp 0.13.0 → 0.14.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
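
The bulk of this release is the updated vendored llama.cpp; the ggml-backend.c diff below is the most interesting part: graph computation now returns an enum ggml_status, backends gain an event API, and ggml_backend_sched_new takes a parallel flag that enables the new pipeline-parallel scheduler. As rough orientation, here is a usage sketch of the new scheduler constructor based only on the signatures visible in this diff (it assumes a CUDA build; build_worst_case_graph is a placeholder, not part of the library):

    // sketch only, based on the signatures added in this diff
    ggml_backend_t backends[2];
    backends[0] = ggml_backend_cuda_init(0);  // GPU first = higher priority
    backends[1] = ggml_backend_cpu_init();    // the last backend must be the CPU

    // NULL bufts -> each backend's default buffer type; parallel=true enables
    // the multi-copy (pipeline parallelism) path added in this version
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE, true);

    // reserve backend buffers from a worst-case measure graph
    struct ggml_cgraph * measure = build_worst_case_graph();
    ggml_backend_sched_reserve(sched, measure);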
@@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

     if (!size) {
         return;
     }

-
+    buf->iface.set_tensor(buf, tensor, data, offset, size);
 }

 GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

     if (!size) {
         return;
     }

-
+    buf->iface.get_tensor(buf, tensor, data, offset, size);
 }

 void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 }

 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
     return backend->iface.graph_plan_create(backend, cgraph);
 }

 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
     backend->iface.graph_plan_free(backend, plan);
 }

-
-    backend->iface.graph_plan_compute
+enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
+    return backend->iface.graph_plan_compute(backend, plan);
+}
+
+enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    ggml_backend_synchronize(backend);
+    return err;
 }

-bool
+bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }

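The hunk above switches the compute entry points from bool to enum ggml_status, so callers can tell allocation failures apart from other errors, and ggml_backend_graph_compute becomes a synchronous wrapper around the new async variant. A caller-side sketch (status names are from the ggml.h of this llama.cpp snapshot):

    enum ggml_status st = ggml_backend_graph_compute(backend, graph);
    if (st == GGML_STATUS_ALLOC_FAILED) {
        // not enough memory for the compute buffers
    } else if (st != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "graph compute failed (status %d)\n", (int) st);
    }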
@@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }

-void ggml_backend_tensor_copy_async(ggml_backend_t
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

     if (src == dst) {
         return;
     }

-    if (
-        if (
-
-            return;
-        }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
     }

-
+    // an async copy would normally happen after all the queued operations on both backends are completed
+    // sync src, set_async dst
     if (ggml_backend_buffer_is_host(src->buffer)) {
-
-
-    else {
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+    } else {
+        ggml_backend_synchronize(backend_src);
         ggml_backend_tensor_copy(src, dst);
+        ggml_backend_synchronize(backend_dst);
     }
 }

+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+    if (backend->iface.event_new == NULL) {
+        return NULL;
+    }
+    return backend->iface.event_new(backend);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    if (event == NULL) {
+        return;
+    }
+    event->backend->iface.event_free(event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_record != NULL);
+
+    event->backend->iface.event_record(event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+
+    event->backend->iface.event_synchronize(event);
+}
+
+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend->iface.event_wait != NULL);
+
+    backend->iface.event_wait(backend, event);
+}

 // backend registry

-#define
+#define GGML_REG_MAX_BACKENDS 16

 struct ggml_backend_reg {
     char name[128];
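ggml_backend_tensor_copy_async now takes both the source and the destination backend, and the new event API above (event_new/record/wait/synchronize/free) is what the scheduler uses to overlap copies with compute. A minimal sketch of the fallback pattern, assuming backend_src and backend_dst are already initialized:

    ggml_backend_event_t ev = ggml_backend_event_new(backend_src); // NULL if the backend has no event support (e.g. CPU)
    // ... queue async work on backend_src ...
    if (ev != NULL) {
        ggml_backend_event_record(ev);             // mark this point in backend_src's queue
        ggml_backend_event_wait(backend_dst, ev);  // backend_dst waits without blocking the host
    } else {
        ggml_backend_synchronize(backend_src);     // fallback: block until backend_src finishes
    }
    ggml_backend_event_free(ev);                   // safe to call with NULL, as in the code above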
@@ -350,7 +396,7 @@ struct ggml_backend_reg {
     void * user_data;
 };

-static struct ggml_backend_reg ggml_backend_registry[
+static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
 static size_t ggml_backend_registry_count = 0;

 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 }

 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count <
+    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);

     size_t id = ggml_backend_registry_count;

@@ -732,22 +778,26 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
     GGML_UNUSED(backend);
 }

-GGML_CALL static
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

     GGML_UNUSED(backend);
 }

-GGML_CALL static
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);

     if (cpu_ctx->work_size < cplan.work_size) {
-
-        cpu_ctx->work_data =
+        free(cpu_ctx->work_data);
+        cpu_ctx->work_data = malloc(cplan.work_size);
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
         cpu_ctx->work_size = cplan.work_size;
     }
     cplan.work_data = cpu_ctx->work_data;
@@ -755,8 +805,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
     cplan.abort_callback = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;

-    ggml_graph_compute(cgraph, &cplan);
-    return true;
+    return ggml_graph_compute(cgraph, &cplan);
 }

 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -785,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };

 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -940,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {

 // scheduler

-#
-#define
-#
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 256
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif

 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-    struct ggml_tensor * inputs[
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     // graph view of this split
     struct ggml_cgraph graph;
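Note that the scheduler limits are now prefixed with GGML_SCHED_ and wrapped in #ifndef guards, so they can be tuned at compile time (for example by passing -DGGML_SCHED_MAX_COPIES=2 to the compiler when building the vendored sources) instead of patching the file; GGML_SCHED_MAX_COPIES is the new knob that bounds the number of pipeline-parallel copies.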
@@ -956,45 +1022,53 @@ struct ggml_backend_sched_split {

 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
+    bool is_alloc;

     int n_backends;
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];

+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;

     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
     // hash values
     int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[
+    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];

-    int * node_backend_ids; // [
-    int
+    int * node_backend_ids; // [graph_size]
+    int * leaf_backend_ids; // [graph_size]

     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;

-
+    // graph splits
+    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
     int n_splits;

+    // pipeline parallelism support
+    int n_copies;
+    int cur_copy;
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_graph_inputs;
+
     struct ggml_context * ctx;

     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;

     // align context_buffer to GGML_MEM_ALIGN
-
+#ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
-
+#else
     __attribute__((aligned(GGML_MEM_ALIGN)))
-
-    char context_buffer[
+#endif
+    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };

-#define hash_id(
-#define tensor_backend_id(
-#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
+#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]

 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1006,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }

-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched,
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1017,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
             return i;
         }
     }
-
-
+
+    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
+            __func__, ggml_backend_buffer_name(buffer), tensor->name);
+    GGML_ASSERT(false);
+
+    return -1;
 }

 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 +
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1036,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

     // assign pre-allocated nodes to their backend
     // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
     if (cur_backend != -1) {
-        SET_CAUSE(
+        SET_CAUSE(tensor, "1.dst");
         return cur_backend;
     }
+
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
         if (cur_backend != -1) {
-            SET_CAUSE(
+            SET_CAUSE(tensor, "1.vsrc");
             return cur_backend;
         }
     }
+
+    // input
+    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
+        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        SET_CAUSE(tensor, "1.inp");
+        return cur_backend;
+    }
+
     // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
@@ -1056,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src
+            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
             // operations with weights are always run on the same backend as the weights
-            SET_CAUSE(
+            SET_CAUSE(tensor, "1.wgt%d", i);
             return src_backend;
         }
     }
@@ -1094,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-        ggml_backend_t tensor_backend =
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1102,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend =
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
@@ -1119,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
+    sched->n_graph_inputs = 0;
     sched->is_reset = false;

     struct ggml_init_params params = {
@@ -1164,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

     // pass 2: expand current backend assignments
@@ -1172,10 +1261,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops

-
+
+    // pass 2.2 expand gpu down
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1190,15 +1280,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.2");
             }
         }
     }

-    // pass 2.
+    // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1213,15 +1303,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.1");
             }
         }
     }

-
+
+    // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1231,15 +1322,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.4");
             }
         }
     }
-
-    // pass 2.4 expand rest down
+    // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1249,12 +1339,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.3");
             }
         }
     }
+
 #ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

     // pass 3: assign backends to remaining src from dst and view_src
@@ -1284,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

     // pass 4: split graph, find tensors that need to be copied
@@ -1316,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (tensor_backend_id != cur_backend_id) {
                 sched->splits[cur_split].i_end = i;
                 cur_split++;
-                GGML_ASSERT(cur_split <
+                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
                 sched->splits[cur_split].backend_id = tensor_backend_id;
                 sched->splits[cur_split].i_start = i;
                 sched->splits[cur_split].n_inputs = 0;
@@ -1329,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
+
             int src_backend_id = tensor_backend_id(src);
             assert(src_backend_id != -1); // all inputs should be assigned by now
+
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                size_t id = hash_id(src);
+                if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                    ggml_backend_t backend = sched->backends[src_backend_id];
+                    for (int c = 0; c < sched->n_copies; c++) {
+                        struct ggml_tensor * tensor_copy;
+                        if (c == sched->cur_copy) {
+                            tensor_copy = src; // use the original tensor as the current copy
+                        } else {
+                            tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                            ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                        }
+                        if (sched->n_copies > 1) {
+                            ggml_set_input(tensor_copy);
+                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                        }
+                        sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                        tensor_backend_id(tensor_copy) = src_backend_id;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+                    }
+                    int n_graph_inputs = sched->n_graph_inputs++;
+                    GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                    sched->graph_inputs[n_graph_inputs] = src;
+                }
+            }
+
             if (src_backend_id != tensor_backend_id) {
                 // create a copy of the input in the split's backend
                 size_t id = hash_id(src);
-                if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+                if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                     ggml_backend_t backend = sched->backends[cur_backend_id];
-
-
-
-
-
-
-
+                    for (int c = 0; c < sched->n_copies; c++) {
+                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                        if (sched->n_copies > 1) {
+                            ggml_set_input(tensor_copy);
+                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                        }
+                        sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                        tensor_backend_id(tensor_copy) = cur_backend_id;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+                    }
                     int n_inputs = sched->splits[cur_split].n_inputs++;
-                    GGML_ASSERT(n_inputs <
+                    GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                     sched->splits[cur_split].inputs[n_inputs] = src;
                 }
-                node->src[j] = sched->tensor_copies[id][cur_backend_id];
+                node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
             }
         }
     }
@@ -1355,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->n_splits = cur_split + 1;
     }
 #ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend =
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         if (tensor_backend == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
-        if (node->view_src != NULL && tensor_backend !=
+        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
                 node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name,
+                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
+                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend =
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                     node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                     j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
             }
-            if (src->view_src != NULL && src_backend !=
+            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
                     src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name,
+                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
+                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
             }
         }
     }
@@ -1393,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #endif

     // create copies of the graph for each split
-    //
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*
+    // TODO: avoid this copy
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];

             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            input_dep->src[0] = input;
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

@@ -1418,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
     }
+
+    if (sched->n_copies > 1) {
+        // add input copies as leafs so that they are allocated first
+        for (int i = 0; i < sched->n_graph_inputs; i++) {
+            struct ggml_tensor * input = sched->graph_inputs[i];
+            size_t id = hash_id(input);
+            int backend_id = tensor_backend_id(input);
+            for (int c = 0; c < sched->n_copies; c++) {
+                struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+            }
+        }
+
+        for (int i = 0; i < sched->n_splits; i++) {
+            struct ggml_backend_sched_split * split = &sched->splits[i];
+            int backend_id = split->backend_id;
+            for (int j = 0; j < split->n_inputs; j++) {
+                struct ggml_tensor * input = split->inputs[j];
+                size_t id = hash_id(input);
+                for (int c = 0; c < sched->n_copies; c++) {
+                    struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                    sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                    graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+                }
+            }
+        }
+    }
+
+    // add leafs from the original graph
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
+    }
+
     sched->graph = graph_copy;
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    //
+    // allocate graph
     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        // the re-allocation may cause the split inputs to be moved to a different address
+        ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "
+        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
-            fprintf(stderr, "
+            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -1437,10 +1602,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     return true;
 }

-static
-    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
-    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     struct ggml_backend_sched_split * splits = sched->splits;

     for (int i = 0; i < sched->n_splits; i++) {
@@ -1449,33 +1611,36 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_backend_t split_backend = sched->backends[split_backend_id];

         // copy the input tensors to the split backend
-        uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];

-
-
+            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                ggml_backend_tensor_copy(input, input_cpy);
+            } else {
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                    ggml_backend_synchronize(input_backend);
+                }

-
+                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+            }
         }
-        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
-        int64_t copy_end_us = ggml_time_us();
-        copy_us[split_backend_id] += copy_end_us - copy_start_us;
-
-#if 0
-        char split_filename[GGML_MAX_NAME];
-        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
-        ggml_graph_dump_dot(split->graph, NULL, split_filename);
-#endif

-
-        uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
-
-
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return ec;
             }
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
@@ -1494,10 +1659,14 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {

                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

-
-
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+                if (ec != GGML_STATUS_SUCCESS) {
+                    return ec;
                 }

+                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                ggml_backend_synchronize(split_backend);
+
                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                     break;
                 }
@@ -1505,39 +1674,54 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
                 j0 = j1;
             }
         }
-        uint64_t compute_end_us = ggml_time_us();
-        compute_us[split_backend_id] += compute_end_us - compute_start_us;
-    }

-
-
-
-
-
-        fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+        // record the event of this copy
+        if (split->n_inputs > 0) {
+            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+            }
         }
     }
-#endif

-
+    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
+
+    return GGML_STATUS_SUCCESS;
 }

-ggml_backend_sched_t ggml_backend_sched_new(
+ggml_backend_sched_t ggml_backend_sched_new(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel) {
     GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <=
+    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size +
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);

     sched->n_backends = n_backends;
-
-
-
+
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+
+    for (int b = 0; b < n_backends; b++) {
+        sched->backends[b] = backends[b];
+        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        if (sched->n_copies > 1) {
+            for (int c = 0; c < sched->n_copies; c++) {
+                sched->events[b][c] = ggml_backend_event_new(backends[b]);
+            }
+        }
     }

     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
@@ -1551,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
         return;
     }
+    for (int b = 0; b < sched->n_backends; b++) {
+        for (int c = 0; c < sched->n_copies; c++) {
+            ggml_backend_event_free(sched->events[b][c]);
+        }
+    }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
+    free(sched->leaf_backend_ids);
     free(sched);
 }

@@ -1568,38 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

     sched->is_reset = true;
+    sched->is_alloc = false;
 }

 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     ggml_backend_sched_split_graph(sched, measure_graph);

-
+    // TODO: extract this to a separate function
+    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }

     ggml_backend_sched_reset(sched);
+    ggml_backend_sched_synchronize(sched);
+
     return true;
 }

-bool
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes +
-
-    if (!sched->is_reset) {
-        ggml_backend_sched_reset(sched);
-    }
+bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);

     ggml_backend_sched_split_graph(sched, graph);
+
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }

-
-        return false;
-    }
+    sched->is_alloc = true;

     return true;
 }

+enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+    ggml_backend_sched_synchronize(sched);
+    return err;
+}
+
+enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    if (!sched->is_reset && !sched->is_alloc) {
+        ggml_backend_sched_reset(sched);
+    }
+
+    if (!sched->is_alloc) {
+        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+    }
+
+    return ggml_backend_sched_compute_splits(sched);
+}
+
+void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_backend_synchronize(sched->backends[i]);
+    }
+}
+
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
     sched->callback_eval = callback;
     sched->callback_eval_user_data = user_data;
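The hunk above is the new per-graph driver of the scheduler: a graph can now be allocated explicitly, computed asynchronously, and synchronized separately. A caller-side sketch of the intended flow (build_graph is a placeholder for user code that builds a ggml_cgraph):

    struct ggml_cgraph * gf = build_graph();              // placeholder
    if (!ggml_backend_sched_alloc_graph(sched, gf)) {     // optional; compute_async allocates on demand
        // handle allocation failure
    }
    enum ggml_status st = ggml_backend_sched_graph_compute_async(sched, gf);
    if (st == GGML_STATUS_SUCCESS) {
        ggml_backend_sched_synchronize(sched);            // wait before reading outputs
    }
    ggml_backend_sched_reset(sched);                      // prepare for the next graph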
@@ -1609,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }

+int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    return sched->n_copies;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }

-void
+void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
 }

-ggml_backend_t
+ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
     int backend_index = tensor_backend_id(node);
     if (backend_index == -1) {
         return NULL;