llama_cpp 0.14.2 → 0.14.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +20 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +154 -124
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8741 -8691
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +260 -28
- data/vendor/tmp/llama.cpp/ggml-quants.c +25 -13
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +237 -78
- data/vendor/tmp/llama.cpp/ggml-sycl.h +6 -1
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml.c +98 -16
- data/vendor/tmp/llama.cpp/llama.cpp +382 -42
- data/vendor/tmp/llama.cpp/llama.h +19 -4
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml-backend.c:

@@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
     return err;
 }
 
-bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }
 
@@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }
 
+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    if (backend->iface.offload_op != NULL) {
+        return backend->iface.offload_op(backend, op);
+    }
+    return false;
+}
+
 // backend copy
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
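The `ggml_backend_offload_op` wrapper added above follows the usual pattern for optional entries in the backend interface: call the function pointer if the backend installed one, otherwise fall back to a safe default (`false`). A minimal standalone sketch of that pattern, using hypothetical `my_backend`/`my_iface` types rather than the real `ggml_backend_i` struct:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

// hypothetical stand-ins for ggml_backend_i / ggml_backend_t
struct my_op;
struct my_backend;

struct my_iface {
    // optional callback: may be NULL if the backend does not implement offloading
    bool (*offload_op)(struct my_backend * backend, const struct my_op * op);
};

struct my_backend {
    struct my_iface iface;
};

// wrapper with a safe default, mirroring ggml_backend_offload_op
static bool my_backend_offload_op(struct my_backend * backend, const struct my_op * op) {
    if (backend->iface.offload_op != NULL) {
        return backend->iface.offload_op(backend, op);
    }
    return false; // backends without the hook never volunteer
}

static bool always_offload(struct my_backend * backend, const struct my_op * op) {
    (void) backend; (void) op;
    return true;
}

int main(void) {
    struct my_backend cpu = { .iface = { .offload_op = NULL } };
    struct my_backend gpu = { .iface = { .offload_op = always_offload } };
    printf("cpu offloads: %d, gpu offloads: %d\n",
           my_backend_offload_op(&cpu, NULL), my_backend_offload_op(&gpu, NULL));
    return 0;
}
```

Because of the NULL fallback, the CPU backend's interface table further below can simply set `.offload_op` to `NULL`.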
@@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
     }
 
     cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
@@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 2048
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES
@@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;
 
     // graph splits
-    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
+    struct ggml_backend_sched_split * splits;
     int n_splits;
+    int splits_capacity;
 
     // pipeline parallelism support
     int n_copies;
@@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op
 
     // assign pre-allocated nodes to their backend
-
-
-    if (cur_backend != -1) {
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
-        return cur_backend;
+        return cur_backend_id;
     }
 
     // view_src
     if (tensor->view_src != NULL) {
-
-        if (
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
-            return cur_backend;
+            return cur_backend_id;
         }
     }
 
-    // input
+    // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
-
+        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
         SET_CAUSE(tensor, "1.inp");
-        return cur_backend;
+        return cur_backend_id;
     }
 
     // assign nodes that use weights to the backend of the weights
+    // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int
-            //
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            // check if a backend with higher prio wants to offload the op
+            if (src_backend_id == sched->n_backends - 1) {
+                for (int b = 0; b < src_backend_id; b++) {
+                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                        SET_CAUSE(tensor, "1.off");
+                        return b;
+                    }
+                }
+            }
             SET_CAUSE(tensor, "1.wgt%d", i);
-            return
+            return src_backend_id;
         }
     }
 
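The new `"1.off"` branch above changes how ops that read weights are scheduled: when a weight ended up on the last backend in the list (the CPU fallback), every higher-priority backend is asked via `ggml_backend_offload_op` whether it wants to run the op anyway, which is how large batches get offloaded to the GPU. A simplified standalone model of that priority scan; the backend count, the `wants_offload` predicate, and the size threshold are assumptions of the sketch, not ggml API:

```c
#include <stdbool.h>
#include <stdio.h>

// hypothetical backend list: index 0 is the highest priority backend,
// the last entry is the CPU fallback
#define N_BACKENDS 3

// stand-in for ggml_backend_offload_op(): does backend b want this op?
static bool wants_offload(int backend_id, int op_size) {
    // sketch assumption: only backend 0 (the GPU) offloads, and only large ops
    return backend_id == 0 && op_size >= 32;
}

// mirrors the "1.off" branch: if the weight lives on the CPU (last backend),
// ask every higher-priority backend whether it wants to run the op instead
static int pick_backend_for_weight_op(int weight_backend_id, int op_size) {
    if (weight_backend_id == N_BACKENDS - 1) {
        for (int b = 0; b < weight_backend_id; b++) {
            if (wants_offload(b, op_size)) {
                return b; // offload to the higher-priority backend
            }
        }
    }
    return weight_backend_id; // otherwise stay with the weight's backend
}

int main(void) {
    printf("small op on CPU weight -> backend %d\n", pick_backend_for_weight_op(2, 8));
    printf("large op on CPU weight -> backend %d\n", pick_backend_for_weight_op(2, 512));
    printf("op on GPU weight       -> backend %d\n", pick_backend_for_weight_op(0, 512));
    return 0;
}
```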
@@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-
+        int * leaf_backend_id = &tensor_backend_id(leaf);
+        if (*leaf_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-
+        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }
 
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id != -1) {
            // do not overwrite user assignments
            continue;
        }
-
+        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
        // src
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
-
-
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
+                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
            }
        }
    }
@@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int
-            if (
-            if (
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id =
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.2");
             }
         }
     }
-
     // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
@@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int
-            if (
-            if (
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id =
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.1");
             }
         }
     }
-
-
     // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
@@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int
-            if (
-                cur_backend_id =
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.4");
             }
         }
     }
-
+    // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int
-            if (
-                cur_backend_id =
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.3");
             }
         }
@@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        int cur_backend_id = tensor_backend_id(node);
-        if (node->view_src != NULL && cur_backend_id == -1) {
-            cur_backend_id = tensor_backend_id(node->view_src);
+        int * cur_backend_id = &tensor_backend_id(node);
+        if (node->view_src != NULL && *cur_backend_id == -1) {
+            *cur_backend_id = tensor_backend_id(node->view_src);
             SET_CAUSE(node, "3.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
-            int src_backend_id = tensor_backend_id(src);
-            if (src_backend_id == -1) {
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
-
+                    *src_backend_id = tensor_backend_id(src->view_src);
                     SET_CAUSE(src, "3.vsrc");
                 } else {
-
+                    *src_backend_id = *cur_backend_id;
                     SET_CAUSE(src, "3.cur");
                 }
             }
@@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
     // pass 4: split graph, find tensors that need to be copied
     {
-        int
+        int i_split = 0;
+        struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
-
+                split->backend_id = tensor_backend_id(node);
                 break;
             }
         }
-
-
-        memset(
-        int cur_backend_id =
+        split->i_start = 0;
+        split->n_inputs = 0;
+        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+        int cur_backend_id = split->backend_id;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
 
@@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 continue;
             }
 
-            int
+            const int node_backend_id = tensor_backend_id(node);
 
-            GGML_ASSERT(
+            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
 
-            if
-
-
-
-
-
-
-
+            // check if we should start a new split based on the sources of the current node
+            bool need_new_split = false;
+            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * src = node->src[j];
+                    if (src == NULL) {
+                        continue;
+                    }
+                    // check if a weight is on a different backend
+                    // by starting a new split, the memory of the previously offloaded weights can be reused
+                    if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                        int src_backend_id = tensor_backend_id(src);
+                        if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                    // check if the split has too many inputs
+                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                        const size_t id = hash_id(src);
+                        int src_backend_id = sched->tensor_backend_id[id];
+                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (node_backend_id != cur_backend_id || need_new_split) {
+                split->i_end = i;
+                i_split++;
+                if (i_split >= sched->splits_capacity) {
+                    sched->splits_capacity *= 2;
+                    sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                    GGML_ASSERT(sched->splits != NULL);
+                }
+                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
+                split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
+                cur_backend_id = node_backend_id;
             }
 
             // find inputs that are not on the same backend
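With `GGML_SCHED_MAX_SPLITS` no longer sized for the worst case, the split list above is grown on demand: when `i_split` reaches `splits_capacity`, the capacity is doubled and the array is `realloc`ed. A small self-contained sketch of that grow-by-doubling append, with a hypothetical `split_list` type standing in for the scheduler state:

```c
#include <assert.h>
#include <stdlib.h>
#include <string.h>

// minimal stand-in for a split descriptor
struct split { int i_start, i_end, backend_id; };

struct split_list {
    struct split * data;
    int            count;
    int            capacity;
};

// grow-by-doubling append, the same pattern the scheduler now uses for sched->splits
static struct split * split_list_push(struct split_list * list) {
    if (list->count >= list->capacity) {
        list->capacity = list->capacity ? list->capacity * 2 : 16;
        list->data = realloc(list->data, list->capacity * sizeof(struct split));
        assert(list->data != NULL); // the real code uses GGML_ASSERT here
    }
    struct split * s = &list->data[list->count++];
    memset(s, 0, sizeof(*s));
    return s;
}

int main(void) {
    struct split_list list = {0};
    for (int i = 0; i < 100; i++) {
        struct split * s = split_list_push(&list);
        s->i_start = i;
    }
    free(list.data);
    return 0;
}
```

Doubling keeps the number of reallocations logarithmic in the number of splits, where the old fixed-size array either wasted memory or capped the split count.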
@@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     continue;
                 }
 
-                int src_backend_id = tensor_backend_id(src);
+                const int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now
 
-                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                     size_t id = hash_id(src);
                     if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                         }
                         sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = src_backend_id;
                         SET_CAUSE(tensor_copy, "4.cpy");
                     }
                     int n_graph_inputs = sched->n_graph_inputs++;
@@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }
 
-                if (src_backend_id !=
+                if (src_backend_id != node_backend_id) {
                     // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
+                    const size_t id = hash_id(src);
                     if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
@@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                         }
                         sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
                         SET_CAUSE(tensor_copy, "4.cpy");
                     }
-                    int n_inputs =
+                    int n_inputs = split->n_inputs++;
                     GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
-
+                    split->inputs[n_inputs] = src;
                 }
                 node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
             }
         }
     }
-
-        sched->n_splits =
+    split->i_end = graph->n_nodes;
+    sched->n_splits = i_split + 1;
     }
 #ifdef DEBUG_PASS4
     fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
-#ifndef NDEBUG
-    // sanity check: all sources should have the same backend as the node
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        if (tensor_backend == NULL) {
-            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
-        }
-        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
-            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
-                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            if (src_backend != tensor_backend /* && src_backend != NULL */) {
-                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
-            }
-            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
-                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
-                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
-            }
-        }
-    }
-    fflush(stderr);
-#endif
-
     // create copies of the graph for each split
     // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
+            assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
             struct ggml_tensor * input = split->inputs[j];
-
+            const size_t input_id = hash_id(input);
+            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
 
             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
             input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
 
             // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
 
         for (int j = split->i_start; j < split->i_end; j++) {
+            assert(graph_copy->size > graph_copy->n_nodes);
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
@@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 }
                 ggml_backend_tensor_copy(input, input_cpy);
             } else {
+                // wait for the split backend to finish using the input before overwriting it
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                 } else {
                     ggml_backend_synchronize(split_backend);
-                    ggml_backend_synchronize(input_backend);
                 }
-
                 ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
             }
         }
@@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size
+    sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-
-
+
+    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
 
     sched->n_backends = n_backends;
 
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
-
+    const int initial_splits_capacity = 16;
+    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
@@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    free(sched->splits);
    free(sched->hash_set.keys);
    free(sched->tensor_backend_id);
    free(sched->tensor_copies);
@@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 }
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
     ggml_backend_sched_split_graph(sched, measure_graph);
 
     // TODO: extract this to a separate function
@@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
 
     ggml_backend_sched_split_graph(sched, graph);
 
data/vendor/tmp/llama.cpp/ggml-backend.h:

@@ -70,11 +70,11 @@ extern "C" {
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-    GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
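For reference, a hedged usage sketch of the public API after this change. It relies only on the declarations shown in the hunk above plus `ggml_backend_name` from the same header; the backend, graph, and tensor are assumed to come from the usual ggml setup code, which is omitted here:

```c
#include <stdio.h>
#include "ggml.h"
#include "ggml-backend.h"

// Sketch only: `backend`, `graph`, and `big_op` are assumed to be created by the
// caller (backend init + graph construction), which is not shown.
static enum ggml_status run_graph(ggml_backend_t backend, struct ggml_cgraph * graph,
                                  const struct ggml_tensor * big_op) {
    // check whether the backend claims support for a particular op
    if (!ggml_backend_supports_op(backend, big_op)) {
        fprintf(stderr, "warning: %s does not support this op\n", ggml_backend_name(backend));
    }

    // new in this release: ask whether the backend would rather run the op itself
    // even though its data lives elsewhere (used by the graph scheduler)
    if (ggml_backend_offload_op(backend, big_op)) {
        fprintf(stderr, "%s volunteers to offload this op\n", ggml_backend_name(backend));
    }

    // graph_compute_async now reports a ggml_status instead of a bool
    return ggml_backend_graph_compute_async(backend, graph);
}
```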