llama_cpp 0.14.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +71 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +9 -0
- data/vendor/tmp/llama.cpp/Makefile +28 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +358 -135
- data/vendor/tmp/llama.cpp/ggml-backend.h +41 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +187 -1033
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +42 -20
- data/vendor/tmp/llama.cpp/ggml-metal.metal +44 -910
- data/vendor/tmp/llama.cpp/ggml-quants.c +457 -1074
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +388 -565
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +6 -39
- data/vendor/tmp/llama.cpp/ggml.c +509 -343
- data/vendor/tmp/llama.cpp/ggml.h +61 -47
- data/vendor/tmp/llama.cpp/llama.cpp +1446 -687
- data/vendor/tmp/llama.cpp/llama.h +25 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
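The bulk of the vendored llama.cpp update is the upstream pipeline-parallelism work in data/vendor/tmp/llama.cpp/ggml-backend.c, whose hunks follow: the backend scheduler gains events, multiple copies of the split inputs, and an asynchronous compute path. For orientation only, here is a minimal usage sketch of the updated scheduler API; backend setup and graph construction are omitted and all variable names are illustrative rather than taken from the gem:

    // Sketch, not gem code: assumes `backends[]` is already initialized (CPU backend last)
    // and `graph` is a built ggml_cgraph.
    ggml_backend_sched_t sched = ggml_backend_sched_new(
            backends, NULL, n_backends, GGML_DEFAULT_GRAPH_SIZE, /*parallel =*/ true);

    if (ggml_backend_sched_graph_compute_async(sched, graph) != GGML_STATUS_SUCCESS) {
        // handle the failure (e.g. GGML_STATUS_ALLOC_FAILED)
    }

    ggml_backend_sched_synchronize(sched); // wait for every backend before reading results
    ggml_backend_sched_free(sched);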
@@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

     if (!size) {
         return;
     }

-
+    buf->iface.set_tensor(buf, tensor, data, offset, size);
 }

 GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

     if (!size) {
         return;
     }

-
+    buf->iface.get_tensor(buf, tensor, data, offset, size);
 }

 void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 }

 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
     return backend->iface.graph_plan_create(backend, cgraph);
 }

 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
     backend->iface.graph_plan_free(backend, plan);
 }

 enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
     return backend->iface.graph_plan_compute(backend, plan);
 }

 enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    ggml_backend_synchronize(backend);
+    return err;
+}
+
+bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }
@@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }

-void ggml_backend_tensor_copy_async(ggml_backend_t
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

     if (src == dst) {
         return;
     }

-    if (
-        if (
-
-            return;
-        }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
     }

-
+    // an async copy would normally happen after all the queued operations on both backends are completed
+    // sync src, set_async dst
     if (ggml_backend_buffer_is_host(src->buffer)) {
-
-
-    else {
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+    } else {
+        ggml_backend_synchronize(backend_src);
         ggml_backend_tensor_copy(src, dst);
+        ggml_backend_synchronize(backend_dst);
+    }
+}
+
+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+    if (backend->iface.event_new == NULL) {
+        return NULL;
+    }
+    return backend->iface.event_new(backend);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    if (event == NULL) {
+        return;
     }
+    event->backend->iface.event_free(event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_record != NULL);
+
+    event->backend->iface.event_record(event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+
+    event->backend->iface.event_synchronize(event);
 }

+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend->iface.event_wait != NULL);
+
+    backend->iface.event_wait(backend, event);
+}

 // backend registry

-#define
+#define GGML_REG_MAX_BACKENDS 16

 struct ggml_backend_reg {
     char name[128];
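The event helpers added above are thin wrappers over the backend interface; ggml_backend_event_new returns NULL when a backend has no event support, and ggml_backend_event_free accepts NULL. A hedged sketch of the intended ordering pattern, with placeholder backend_a/backend_b handles that are not part of the diff:

    // Sketch only: backend_a produces data that backend_b consumes.
    ggml_backend_event_t ev = ggml_backend_event_new(backend_a); // NULL if unsupported

    // ... queue asynchronous work on backend_a ...
    if (ev != NULL) {
        ggml_backend_event_record(ev);          // mark the point to wait for
        ggml_backend_event_wait(backend_b, ev); // backend_b waits without blocking the host
    } else {
        ggml_backend_synchronize(backend_a);    // fallback when events are unavailable
    }
    // ... queue dependent work on backend_b ...

    ggml_backend_event_free(ev);

The scheduler code later in this diff uses the same event-or-synchronize fallback when copying split inputs between backends.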
@@ -350,7 +396,7 @@ struct ggml_backend_reg {
     void * user_data;
 };

-static struct ggml_backend_reg ggml_backend_registry[
+static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
 static size_t ggml_backend_registry_count = 0;

 GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
@@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 }

 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count <
+    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);

     size_t id = ggml_backend_registry_count;
@@ -746,8 +792,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);

     if (cpu_ctx->work_size < cplan.work_size) {
-
-        cpu_ctx->work_data =
+        free(cpu_ctx->work_data);
+        cpu_ctx->work_data = malloc(cplan.work_size);
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
         cpu_ctx->work_size = cplan.work_size;
     }
     cplan.work_data = cpu_ctx->work_data;
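With this change the CPU backend reports a failed work-buffer allocation as GGML_STATUS_ALLOC_FAILED instead of assuming the allocation succeeds. A caller-side sketch (illustrative fragment, not code from the diff):

    // Sketch only: check the status instead of assuming compute always succeeds.
    enum ggml_status st = ggml_backend_graph_compute(backend, cgraph);
    if (st == GGML_STATUS_ALLOC_FAILED) {
        // the work buffer could not be allocated; free memory or reduce the graph size
    }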
@@ -784,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };

 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -939,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) {

 // scheduler

-#
-#define
-#
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 256
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif

 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
-    struct ggml_tensor * inputs[
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     // graph view of this split
     struct ggml_cgraph graph;
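Because the scheduler limits above are now wrapped in #ifndef guards, they can be overridden at build time instead of patching the vendored source; for example (illustrative define, not part of the gem's build):

    // Define before ggml-backend.c is compiled (e.g. -DGGML_SCHED_MAX_SPLITS=1024 on the
    // compiler command line) to raise the split limit.
    #define GGML_SCHED_MAX_SPLITS 1024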
@@ -955,45 +1022,53 @@ struct ggml_backend_sched_split {

 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
+    bool is_alloc;

     int n_backends;
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];

+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;

     // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
     // hash values
     int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[
+    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];

-    int * node_backend_ids; // [
-    int
+    int * node_backend_ids; // [graph_size]
+    int * leaf_backend_ids; // [graph_size]

     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;

-
+    // graph splits
+    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
     int n_splits;

+    // pipeline parallelism support
+    int n_copies;
+    int cur_copy;
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_graph_inputs;
+
     struct ggml_context * ctx;

     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;

     // align context_buffer to GGML_MEM_ALIGN
-
+#ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
-
+#else
     __attribute__((aligned(GGML_MEM_ALIGN)))
-
-    char context_buffer[
+#endif
+    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };

-#define hash_id(
-#define tensor_backend_id(
-#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
+#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]

 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1005,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }

-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched,
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1016,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
             return i;
         }
     }
-
-
+
+    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
+        __func__, ggml_backend_buffer_name(buffer), tensor->name);
+    GGML_ASSERT(false);
+
+    return -1;
 }

 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 +
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1035,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

     // assign pre-allocated nodes to their backend
     // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor
+    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
     if (cur_backend != -1) {
-        SET_CAUSE(
+        SET_CAUSE(tensor, "1.dst");
         return cur_backend;
     }
+
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src
+        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
         if (cur_backend != -1) {
-            SET_CAUSE(
+            SET_CAUSE(tensor, "1.vsrc");
             return cur_backend;
         }
     }
+
+    // input
+    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
+        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        SET_CAUSE(tensor, "1.inp");
+        return cur_backend;
+    }
+
     // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
@@ -1055,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src
+            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
             // operations with weights are always run on the same backend as the weights
-            SET_CAUSE(
+            SET_CAUSE(tensor, "1.wgt%d", i);
             return src_backend;
         }
     }
@@ -1093,7 +1182,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-        ggml_backend_t tensor_backend =
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1101,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend =
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
@@ -1118,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
+    sched->n_graph_inputs = 0;
     sched->is_reset = false;

     struct ggml_init_params params = {
@@ -1163,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

     // pass 2: expand current backend assignments
@@ -1171,10 +1261,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops

-
+
+    // pass 2.2 expand gpu down
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1189,15 +1280,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.2");
             }
         }
     }

-    // pass 2.
+    // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1212,15 +1303,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.1");
             }
         }
     }

-
+
+    // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1230,15 +1322,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.4");
             }
         }
     }
-
-    // pass 2.4 expand rest down
+    // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
-        for (int i =
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
             if (ggml_is_view_op(node->op)) {
                 continue;
@@ -1248,12 +1339,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = tensor_backend_id;
             } else {
                 tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.
+                SET_CAUSE(node, "2.3");
             }
         }
     }
+
 #ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

     // pass 3: assign backends to remaining src from dst and view_src
@@ -1283,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 #ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

     // pass 4: split graph, find tensors that need to be copied
@@ -1315,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         if (tensor_backend_id != cur_backend_id) {
             sched->splits[cur_split].i_end = i;
             cur_split++;
-            GGML_ASSERT(cur_split <
+            GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
             sched->splits[cur_split].backend_id = tensor_backend_id;
             sched->splits[cur_split].i_start = i;
             sched->splits[cur_split].n_inputs = 0;
@@ -1328,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
+
             int src_backend_id = tensor_backend_id(src);
             assert(src_backend_id != -1); // all inputs should be assigned by now
+
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                size_t id = hash_id(src);
+                if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                    ggml_backend_t backend = sched->backends[src_backend_id];
+                    for (int c = 0; c < sched->n_copies; c++) {
+                        struct ggml_tensor * tensor_copy;
+                        if (c == sched->cur_copy) {
+                            tensor_copy = src; // use the original tensor as the current copy
+                        } else {
+                            tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                            ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                        }
+                        if (sched->n_copies > 1) {
+                            ggml_set_input(tensor_copy);
+                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                        }
+                        sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                        tensor_backend_id(tensor_copy) = src_backend_id;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+                    }
+                    int n_graph_inputs = sched->n_graph_inputs++;
+                    GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                    sched->graph_inputs[n_graph_inputs] = src;
+                }
+            }
+
             if (src_backend_id != tensor_backend_id) {
                 // create a copy of the input in the split's backend
                 size_t id = hash_id(src);
-                if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+                if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                     ggml_backend_t backend = sched->backends[cur_backend_id];
-
-
-
-
-
-
-
+                    for (int c = 0; c < sched->n_copies; c++) {
+                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                        if (sched->n_copies > 1) {
+                            ggml_set_input(tensor_copy);
+                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                        }
+                        sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                        tensor_backend_id(tensor_copy) = cur_backend_id;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+                    }
                     int n_inputs = sched->splits[cur_split].n_inputs++;
-                    GGML_ASSERT(n_inputs <
+                    GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                     sched->splits[cur_split].inputs[n_inputs] = src;
                 }
-                node->src[j] = sched->tensor_copies[id][cur_backend_id];
+                node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
             }
         }
     }
@@ -1354,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->n_splits = cur_split + 1;
     }
 #ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n");
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend =
+        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
         if (tensor_backend == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
-        if (node->view_src != NULL && tensor_backend !=
+        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
             fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
                 node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name,
+                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
+                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            ggml_backend_t src_backend =
+            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                     node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                     j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
             }
-            if (src->view_src != NULL && src_backend !=
+            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
                 fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
                     src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name,
+                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
+                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
             }
         }
     }
@@ -1392,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #endif

     // create copies of the graph for each split
-    //
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*
+    // TODO: avoid this copy
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];

             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            input_dep->src[0] = input;
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
@@ -1417,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
     }
+
+    if (sched->n_copies > 1) {
+        // add input copies as leafs so that they are allocated first
+        for (int i = 0; i < sched->n_graph_inputs; i++) {
+            struct ggml_tensor * input = sched->graph_inputs[i];
+            size_t id = hash_id(input);
+            int backend_id = tensor_backend_id(input);
+            for (int c = 0; c < sched->n_copies; c++) {
+                struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+            }
+        }
+
+        for (int i = 0; i < sched->n_splits; i++) {
+            struct ggml_backend_sched_split * split = &sched->splits[i];
+            int backend_id = split->backend_id;
+            for (int j = 0; j < split->n_inputs; j++) {
+                struct ggml_tensor * input = split->inputs[j];
+                size_t id = hash_id(input);
+                for (int c = 0; c < sched->n_copies; c++) {
+                    struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                    sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                    graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+                }
+            }
+        }
+    }
+
+    // add leafs from the original graph
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
+    }
+
     sched->graph = graph_copy;
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    //
+    // allocate graph
     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        // the re-allocation may cause the split inputs to be moved to a different address
+        ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "
+        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
-            fprintf(stderr, "
+            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -1437,9 +1603,6 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 }

 static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
-    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
-    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
     struct ggml_backend_sched_split * splits = sched->splits;

     for (int i = 0; i < sched->n_splits; i++) {
@@ -1448,34 +1611,36 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         ggml_backend_t split_backend = sched->backends[split_backend_id];

         // copy the input tensors to the split backend
-        uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
+            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];

-
-
+            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                ggml_backend_tensor_copy(input, input_cpy);
+            } else {
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                    ggml_backend_synchronize(input_backend);
+                }

-
+                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+            }
         }
-        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
-        int64_t copy_end_us = ggml_time_us();
-        copy_us[split_backend_id] += copy_end_us - copy_start_us;

-#if 0
-        char split_filename[GGML_MAX_NAME];
-        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
-        ggml_graph_dump_dot(split->graph, NULL, split_filename);
-#endif
-
-
-        uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
-            enum ggml_status ec =
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
             }
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
@@ -1494,11 +1659,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s

                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

-                enum ggml_status ec =
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
                 if (ec != GGML_STATUS_SUCCESS) {
                     return ec;
                 }

+                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                ggml_backend_synchronize(split_backend);
+
                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                     break;
                 }
@@ -1506,39 +1674,54 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 j0 = j1;
             }
         }
-        uint64_t compute_end_us = ggml_time_us();
-        compute_us[split_backend_id] += compute_end_us - compute_start_us;
-    }

-
-
-
-
-
-            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+        // record the event of this copy
+        if (split->n_inputs > 0) {
+            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+            }
         }
     }
-
+
+    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;

     return GGML_STATUS_SUCCESS;
 }

-ggml_backend_sched_t ggml_backend_sched_new(
+ggml_backend_sched_t ggml_backend_sched_new(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel) {
     GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <=
+    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size +
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
     sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);

     sched->n_backends = n_backends;
-
-
-
+
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+
+    for (int b = 0; b < n_backends; b++) {
+        sched->backends[b] = backends[b];
+        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        if (sched->n_copies > 1) {
+            for (int c = 0; c < sched->n_copies; c++) {
+                sched->events[b][c] = ggml_backend_event_new(backends[b]);
+            }
+        }
     }

     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
@@ -1552,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     if (sched == NULL) {
         return;
     }
+    for (int b = 0; b < sched->n_backends; b++) {
+        for (int c = 0; c < sched->n_copies; c++) {
+            ggml_backend_event_free(sched->events[b][c]);
+        }
+    }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
+    free(sched->leaf_backend_ids);
     free(sched);
 }
@@ -1569,34 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

     sched->is_reset = true;
+    sched->is_alloc = false;
 }

 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     ggml_backend_sched_split_graph(sched, measure_graph);

-
+    // TODO: extract this to a separate function
+    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }

     ggml_backend_sched_reset(sched);
+    ggml_backend_sched_synchronize(sched);
+
+    return true;
+}
+
+bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+
+    ggml_backend_sched_split_graph(sched, graph);
+
+    if (!ggml_backend_sched_alloc_splits(sched)) {
+        return false;
+    }
+
+    sched->is_alloc = true;
+
     return true;
 }

 enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-
+    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+    ggml_backend_sched_synchronize(sched);
+    return err;
+}

-
+enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    if (!sched->is_reset && !sched->is_alloc) {
         ggml_backend_sched_reset(sched);
     }

-
-
-
+    if (!sched->is_alloc) {
+        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+            return GGML_STATUS_ALLOC_FAILED;
+        }
     }

     return ggml_backend_sched_compute_splits(sched);
 }

+void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_backend_synchronize(sched->backends[i]);
+    }
+}
+
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
     sched->callback_eval = callback;
     sched->callback_eval_user_data = user_data;
@@ -1606,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }

+int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    return sched->n_copies;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }

-void
+void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
 }

-ggml_backend_t
+ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
     int backend_index = tensor_backend_id(node);
     if (backend_index == -1) {
         return NULL;