llama_cpp 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
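
The largest behavioural change in this release is in `data/vendor/tmp/llama.cpp/ggml-backend.c`, whose hunks are reproduced below: graph computation now returns `enum ggml_status`, the backend scheduler gains pipeline-parallelism support (`n_copies`, per-backend events, `ggml_backend_sched_graph_compute_async`), and `ggml_backend_sched_new` takes explicit buffer types plus a `parallel` flag. The sketch below is not part of the diff; it only illustrates, under the signatures visible in these hunks, how a caller might drive the new scheduler API. `run_graph` and `build_graph` are made-up names for the illustration.

```c
// Illustrative sketch only - based on the ggml-backend.c signatures shown in the
// hunks below, not on a documented llama_cpp gem API. build_graph() is hypothetical.
#include "ggml.h"
#include "ggml-backend.h"

extern struct ggml_cgraph * build_graph(struct ggml_context * ctx); // hypothetical user code

enum ggml_status run_graph(ggml_backend_t * backends, int n_backends, struct ggml_context * ctx) {
    // NULL bufts selects each backend's default buffer type; the last backend must be
    // the CPU backend; parallel=true enables the new n_copies/event machinery.
    ggml_backend_sched_t sched =
        ggml_backend_sched_new(backends, NULL, n_backends, GGML_DEFAULT_GRAPH_SIZE, true);

    struct ggml_cgraph * graph = build_graph(ctx);
    ggml_backend_sched_reserve(sched, graph); // measure the graph and reserve backend buffers

    // asynchronous submission followed by an explicit synchronize;
    // ggml_backend_sched_graph_compute() wraps exactly these two calls.
    enum ggml_status status = ggml_backend_sched_graph_compute_async(sched, graph);
    ggml_backend_sched_synchronize(sched);

    ggml_backend_sched_free(sched);
    return status;
}
```

With `parallel` set, the scheduler keeps up to `GGML_SCHED_MAX_COPIES` copies of split inputs and uses the new backend events to overlap input copies with compute, as the `ggml_backend_sched_compute_splits` hunks further down show.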
```diff
@@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

     if (!size) {
         return;
     }

-
+    buf->iface.set_tensor(buf, tensor, data, offset, size);
 }

 GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

     if (!size) {
         return;
     }

-
+    buf->iface.get_tensor(buf, tensor, data, offset, size);
 }

 void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 }

 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
     return backend->iface.graph_plan_create(backend, cgraph);
 }

 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
     backend->iface.graph_plan_free(backend, plan);
 }

-
-    backend->iface.graph_plan_compute
+enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
+    return backend->iface.graph_plan_compute(backend, plan);
+}
+
+enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    ggml_backend_synchronize(backend);
+    return err;
 }

-bool
+bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }

@@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }

-void ggml_backend_tensor_copy_async(ggml_backend_t
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

     if (src == dst) {
         return;
     }

-    if (
-    if (
-
-            return;
-    }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
     }

-
+    // an async copy would normally happen after all the queued operations on both backends are completed
+    // sync src, set_async dst
     if (ggml_backend_buffer_is_host(src->buffer)) {
-
-
-    else {
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+    } else {
+        ggml_backend_synchronize(backend_src);
         ggml_backend_tensor_copy(src, dst);
+        ggml_backend_synchronize(backend_dst);
     }
 }

+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+    if (backend->iface.event_new == NULL) {
+        return NULL;
+    }
+    return backend->iface.event_new(backend);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    if (event == NULL) {
+        return;
+    }
+    event->backend->iface.event_free(event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_record != NULL);
+
+    event->backend->iface.event_record(event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+
+    event->backend->iface.event_synchronize(event);
+}
+
+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend->iface.event_wait != NULL);
+
+    backend->iface.event_wait(backend, event);
+}

 // backend registry

-#define
+#define GGML_REG_MAX_BACKENDS 16

 struct ggml_backend_reg {
     char name[128];
```
```diff
@@ -350,7 +396,7 @@ struct ggml_backend_reg {
     void * user_data;
 };

-static struct ggml_backend_reg ggml_backend_registry[
+static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
 static size_t ggml_backend_registry_count = 0;

 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 }

 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count <
+    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);

     size_t id = ggml_backend_registry_count;

@@ -732,22 +778,26 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
     GGML_UNUSED(backend);
 }

-GGML_CALL static
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

     GGML_UNUSED(backend);
 }

-GGML_CALL static
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);

     if (cpu_ctx->work_size < cplan.work_size) {
-
-        cpu_ctx->work_data =
+        free(cpu_ctx->work_data);
+        cpu_ctx->work_data = malloc(cplan.work_size);
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
         cpu_ctx->work_size = cplan.work_size;
     }
     cplan.work_data = cpu_ctx->work_data;
@@ -755,8 +805,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
     cplan.abort_callback = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;

-    ggml_graph_compute(cgraph, &cplan);
-    return true;
+    return ggml_graph_compute(cgraph, &cplan);
 }

 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -785,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };

 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -940,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {

 // scheduler

-#
-#define
-#
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 256
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif

 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-    struct ggml_tensor * inputs[
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     // graph view of this split
     struct ggml_cgraph graph;
@@ -956,45 +1022,53 @@ struct ggml_backend_sched_split {

 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
+    bool is_alloc;

     int n_backends;
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];

+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;

     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
     // hash values
     int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[
+    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];

-    int * node_backend_ids; // [
-    int
+    int * node_backend_ids; // [graph_size]
+    int * leaf_backend_ids; // [graph_size]

     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;

-
+    // graph splits
+    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
     int n_splits;

+    // pipeline parallelism support
+    int n_copies;
+    int cur_copy;
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_graph_inputs;
+
     struct ggml_context * ctx;

     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;

     // align context_buffer to GGML_MEM_ALIGN
-
+#ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
-
+#else
     __attribute__((aligned(GGML_MEM_ALIGN)))
-
-    char context_buffer[
+#endif
+    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };

-#define hash_id(
-#define tensor_backend_id(
-#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
+#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]

 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1006,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }

-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched,
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1017,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
             return i;
         }
     }
-
-
+
+    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
+            __func__, ggml_backend_buffer_name(buffer), tensor->name);
+    GGML_ASSERT(false);
+
+    return -1;
 }

 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 +
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1036,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

     // assign pre-allocated nodes to their backend
     // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
     if (cur_backend != -1) {
-        SET_CAUSE(
+        SET_CAUSE(tensor, "1.dst");
         return cur_backend;
     }
+
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
         if (cur_backend != -1) {
-            SET_CAUSE(
+            SET_CAUSE(tensor, "1.vsrc");
             return cur_backend;
         }
     }
+
+    // input
+    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
+        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        SET_CAUSE(tensor, "1.inp");
+        return cur_backend;
+    }
+
     // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
@@ -1056,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src
+            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
             // operations with weights are always run on the same backend as the weights
-            SET_CAUSE(
+            SET_CAUSE(tensor, "1.wgt%d", i);
             return src_backend;
         }
     }
@@ -1094,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-        ggml_backend_t tensor_backend =
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1102,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend =
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
@@ -1119,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
+    sched->n_graph_inputs = 0;
     sched->is_reset = false;

     struct ggml_init_params params = {
@@ -1164,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

     // pass 2: expand current backend assignments
@@ -1172,10 +1261,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops

-
+
+    // pass 2.2 expand gpu down
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1190,15 +1280,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.2");
             }
         }
     }

-    // pass 2.
+    // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1213,15 +1303,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.1");
             }
         }
     }

-
+
+    // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1231,15 +1322,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.4");
             }
         }
     }
-
-    // pass 2.4 expand rest down
+    // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1249,12 +1339,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.3");
             }
         }
     }
+
 #ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

     // pass 3: assign backends to remaining src from dst and view_src
@@ -1284,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

     // pass 4: split graph, find tensors that need to be copied
@@ -1316,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (tensor_backend_id != cur_backend_id) {
                 sched->splits[cur_split].i_end = i;
                 cur_split++;
-                GGML_ASSERT(cur_split <
+                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
                 sched->splits[cur_split].backend_id = tensor_backend_id;
                 sched->splits[cur_split].i_start = i;
                 sched->splits[cur_split].n_inputs = 0;
@@ -1329,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (src == NULL) {
                     continue;
                 }
+
                 int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now
+
+                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                    size_t id = hash_id(src);
+                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                        ggml_backend_t backend = sched->backends[src_backend_id];
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy;
+                            if (c == sched->cur_copy) {
+                                tensor_copy = src; // use the original tensor as the current copy
+                            } else {
+                                tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                                ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            }
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                            tensor_backend_id(tensor_copy) = src_backend_id;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
+                        int n_graph_inputs = sched->n_graph_inputs++;
+                        GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                        sched->graph_inputs[n_graph_inputs] = src;
+                    }
+                }
+
                 if (src_backend_id != tensor_backend_id) {
                     // create a copy of the input in the split's backend
                     size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
-
-
-
-
-
-
-
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                            ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                            tensor_backend_id(tensor_copy) = cur_backend_id;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
                         int n_inputs = sched->splits[cur_split].n_inputs++;
-                        GGML_ASSERT(n_inputs <
+                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         sched->splits[cur_split].inputs[n_inputs] = src;
                     }
-                    node->src[j] = sched->tensor_copies[id][cur_backend_id];
+                    node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
                 }
             }
         }
@@ -1355,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->n_splits = cur_split + 1;
     }
 #ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend =
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         if (tensor_backend == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
-        if (node->view_src != NULL && tensor_backend !=
+        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
                 node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name,
+                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
+                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend =
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                     node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                     j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
             }
-            if (src->view_src != NULL && src_backend !=
+            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
                     src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name,
+                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
+                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
             }
         }
     }
@@ -1393,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #endif

     // create copies of the graph for each split
-    //
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*
+    // TODO: avoid this copy
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];

             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            input_dep->src[0] = input;
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

@@ -1418,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
     }
+
+    if (sched->n_copies > 1) {
+        // add input copies as leafs so that they are allocated first
+        for (int i = 0; i < sched->n_graph_inputs; i++) {
+            struct ggml_tensor * input = sched->graph_inputs[i];
+            size_t id = hash_id(input);
+            int backend_id = tensor_backend_id(input);
+            for (int c = 0; c < sched->n_copies; c++) {
+                struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+            }
+        }
+
+        for (int i = 0; i < sched->n_splits; i++) {
+            struct ggml_backend_sched_split * split = &sched->splits[i];
+            int backend_id = split->backend_id;
+            for (int j = 0; j < split->n_inputs; j++) {
+                struct ggml_tensor * input = split->inputs[j];
+                size_t id = hash_id(input);
+                for (int c = 0; c < sched->n_copies; c++) {
+                    struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                    sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                    graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+                }
+            }
+        }
+    }
+
+    // add leafs from the original graph
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
+    }
+
     sched->graph = graph_copy;
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    //
+    // allocate graph
     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        // the re-allocation may cause the split inputs to be moved to a different address
+        ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "
+        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
-            fprintf(stderr, "
+            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -1437,10 +1602,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     return true;
 }

-static
-    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
-    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     struct ggml_backend_sched_split * splits = sched->splits;

     for (int i = 0; i < sched->n_splits; i++) {
@@ -1449,33 +1611,36 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_backend_t split_backend = sched->backends[split_backend_id];

         // copy the input tensors to the split backend
-        uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];

-
-
+            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                ggml_backend_tensor_copy(input, input_cpy);
+            } else {
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                    ggml_backend_synchronize(input_backend);
+                }

-
+                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+            }
         }
-        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
-        int64_t copy_end_us = ggml_time_us();
-        copy_us[split_backend_id] += copy_end_us - copy_start_us;
-
-#if 0
-        char split_filename[GGML_MAX_NAME];
-        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
-        ggml_graph_dump_dot(split->graph, NULL, split_filename);
-#endif

-
-        uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
-
-
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return ec;
             }
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
@@ -1494,10 +1659,14 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {

                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

-
-
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+                if (ec != GGML_STATUS_SUCCESS) {
+                    return ec;
                 }

+                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                ggml_backend_synchronize(split_backend);
+
                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                     break;
                 }
@@ -1505,39 +1674,54 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
                 j0 = j1;
             }
         }
-        uint64_t compute_end_us = ggml_time_us();
-        compute_us[split_backend_id] += compute_end_us - compute_start_us;
-    }

-
-
-
-
-
-        fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+        // record the event of this copy
+        if (split->n_inputs > 0) {
+            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+            }
         }
     }
-#endif

-
+    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
+
+    return GGML_STATUS_SUCCESS;
 }

-ggml_backend_sched_t ggml_backend_sched_new(
+ggml_backend_sched_t ggml_backend_sched_new(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel) {
     GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <=
+    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size +
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);

     sched->n_backends = n_backends;
-
-
-
+
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+
+    for (int b = 0; b < n_backends; b++) {
+        sched->backends[b] = backends[b];
+        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        if (sched->n_copies > 1) {
+            for (int c = 0; c < sched->n_copies; c++) {
+                sched->events[b][c] = ggml_backend_event_new(backends[b]);
+            }
+        }
     }

     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
@@ -1551,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
         return;
     }
+    for (int b = 0; b < sched->n_backends; b++) {
+        for (int c = 0; c < sched->n_copies; c++) {
+            ggml_backend_event_free(sched->events[b][c]);
+        }
+    }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
+    free(sched->leaf_backend_ids);
     free(sched);
 }

@@ -1568,38 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

     sched->is_reset = true;
+    sched->is_alloc = false;
 }

 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     ggml_backend_sched_split_graph(sched, measure_graph);

-
+    // TODO: extract this to a separate function
+    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }

     ggml_backend_sched_reset(sched);
+    ggml_backend_sched_synchronize(sched);
+
     return true;
 }

-bool
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes +
-
-    if (!sched->is_reset) {
-        ggml_backend_sched_reset(sched);
-    }
+bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);

     ggml_backend_sched_split_graph(sched, graph);
+
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }

-
-        return false;
-    }
+    sched->is_alloc = true;

     return true;
 }

+enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+    ggml_backend_sched_synchronize(sched);
+    return err;
+}
+
+enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    if (!sched->is_reset && !sched->is_alloc) {
+        ggml_backend_sched_reset(sched);
+    }
+
+    if (!sched->is_alloc) {
+        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+    }
+
+    return ggml_backend_sched_compute_splits(sched);
+}
+
+void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_backend_synchronize(sched->backends[i]);
+    }
+}
+
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
     sched->callback_eval = callback;
     sched->callback_eval_user_data = user_data;
@@ -1609,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }

+int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    return sched->n_copies;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }

-void
+void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
 }

-ggml_backend_t
+ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
     int backend_index = tensor_backend_id(node);
     if (backend_index == -1) {
         return NULL;
```