llama_cpp 0.14.2 → 0.14.3

@@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
     return err;
 }

-bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }

@@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }

+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    if (backend->iface.offload_op != NULL) {
+        return backend->iface.offload_op(backend, op);
+    }
+    return false;
+}
+
 // backend copy

 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
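Note: ggml_backend_offload_op is a new hook that lets a higher-priority backend ask to run an op even though its weights live in another backend's buffer (the CPU backend leaves the slot NULL, see below). An illustrative sketch of what a backend might plug into the iface slot; the function name and threshold are made up for this example and are not part of the diff:

    static bool example_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
        // illustrative policy: only request batched matrix multiplications
        const int64_t min_batch_size = 32;
        GGML_UNUSED(backend);
        return op->op == GGML_OP_MUL_MAT && op->ne[1] >= min_batch_size;
    }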
@@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg

     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
     }

     cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
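Note: a backend that does implement the hook wires its own function into the same slot of its ggml_backend_i table; the names below are the illustrative ones from the sketch above, not symbols from this diff:

    /* .supports_op              = */ example_backend_supports_op,
    /* .offload_op               = */ example_backend_offload_op,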
@@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif

 #ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 256
+#define GGML_SCHED_MAX_SPLITS 2048
 #endif

 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif

 #ifndef GGML_SCHED_MAX_COPIES
@@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;

     // graph splits
-    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
+    struct ggml_backend_sched_split * splits;
     int n_splits;
+    int splits_capacity;

     // pipeline parallelism support
     int n_copies;
@@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op

     // assign pre-allocated nodes to their backend
-    // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
-    if (cur_backend != -1) {
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
-        return cur_backend;
+        return cur_backend_id;
     }

     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
-        if (cur_backend != -1) {
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
-            return cur_backend;
+            return cur_backend_id;
         }
     }

-    // input
+    // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
-        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
         SET_CAUSE(tensor, "1.inp");
-        return cur_backend;
+        return cur_backend_id;
     }

     // assign nodes that use weights to the backend of the weights
+    // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
-            // operations with weights are always run on the same backend as the weights
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            // check if a backend with higher prio wants to offload the op
+            if (src_backend_id == sched->n_backends - 1) {
+                for (int b = 0; b < src_backend_id; b++) {
+                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                        SET_CAUSE(tensor, "1.off");
+                        return b;
+                    }
+                }
+            }
             SET_CAUSE(tensor, "1.wgt%d", i);
-            return src_backend;
+            return src_backend_id;
         }
     }

@@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-        if (tensor_backend_id(leaf) != -1) {
+        int * leaf_backend_id = &tensor_backend_id(leaf);
+        if (*leaf_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }

     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        if (tensor_backend_id(node) != -1) {
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
+        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
         // src
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            if (tensor_backend_id(src) == -1) {
-                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
+                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
             }
         }
     }
@@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.2");
             }
         }
     }
-
     // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
@@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.1");
             }
         }
     }
-
-
     // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
@@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.4");
             }
         }
     }
-     // pass 2.3 expand rest up
+    // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.3");
             }
         }
@@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        int cur_backend_id = tensor_backend_id(node);
-        if (node->view_src != NULL && cur_backend_id == -1) {
-            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
+        int * cur_backend_id = &tensor_backend_id(node);
+        if (node->view_src != NULL && *cur_backend_id == -1) {
+            *cur_backend_id = tensor_backend_id(node->view_src);
             SET_CAUSE(node, "3.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
-            int src_backend_id = tensor_backend_id(src);
-            if (src_backend_id == -1) {
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
-                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
+                    *src_backend_id = tensor_backend_id(src->view_src);
                     SET_CAUSE(src, "3.vsrc");
                 } else {
-                    tensor_backend_id(src) = cur_backend_id;
+                    *src_backend_id = *cur_backend_id;
                     SET_CAUSE(src, "3.cur");
                 }
             }
@@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

     // pass 4: split graph, find tensors that need to be copied
     {
-        int cur_split = 0;
+        int i_split = 0;
+        struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
-                sched->splits[0].backend_id = tensor_backend_id(node);
+                split->backend_id = tensor_backend_id(node);
                 break;
             }
         }
-        sched->splits[0].i_start = 0;
-        sched->splits[0].n_inputs = 0;
-        memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-        int cur_backend_id = sched->splits[0].backend_id;
+        split->i_start = 0;
+        split->n_inputs = 0;
+        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+        int cur_backend_id = split->backend_id;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];

@@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 continue;
             }

-            int tensor_backend_id = tensor_backend_id(node);
+            const int node_backend_id = tensor_backend_id(node);

-            GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
+            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now

-            if (tensor_backend_id != cur_backend_id) {
-                sched->splits[cur_split].i_end = i;
-                cur_split++;
-                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
-                sched->splits[cur_split].backend_id = tensor_backend_id;
-                sched->splits[cur_split].i_start = i;
-                sched->splits[cur_split].n_inputs = 0;
-                cur_backend_id = tensor_backend_id;
+            // check if we should start a new split based on the sources of the current node
+            bool need_new_split = false;
+            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * src = node->src[j];
+                    if (src == NULL) {
+                        continue;
+                    }
+                    // check if a weight is on a different backend
+                    // by starting a new split, the memory of the previously offloaded weights can be reused
+                    if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                        int src_backend_id = tensor_backend_id(src);
+                        if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                    // check if the split has too many inputs
+                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                        const size_t id = hash_id(src);
+                        int src_backend_id = sched->tensor_backend_id[id];
+                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (node_backend_id != cur_backend_id || need_new_split) {
+                split->i_end = i;
+                i_split++;
+                if (i_split >= sched->splits_capacity) {
+                    sched->splits_capacity *= 2;
+                    sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                    GGML_ASSERT(sched->splits != NULL);
+                }
+                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
+                split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
+                cur_backend_id = node_backend_id;
             }

             // find inputs that are not on the same backend
@@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     continue;
                 }

-                int src_backend_id = tensor_backend_id(src);
+                const int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now

-                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                     size_t id = hash_id(src);
                     if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                             }
                             sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
-                            tensor_backend_id(tensor_copy) = src_backend_id;
                             SET_CAUSE(tensor_copy, "4.cpy");
                         }
                         int n_graph_inputs = sched->n_graph_inputs++;
@@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }

-                if (src_backend_id != tensor_backend_id) {
+                if (src_backend_id != node_backend_id) {
                     // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
+                    const size_t id = hash_id(src);
                     if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
@@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                             }
                             sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
-                            tensor_backend_id(tensor_copy) = cur_backend_id;
                             SET_CAUSE(tensor_copy, "4.cpy");
                         }
-                        int n_inputs = sched->splits[cur_split].n_inputs++;
+                        int n_inputs = split->n_inputs++;
                         GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
-                        sched->splits[cur_split].inputs[n_inputs] = src;
+                        split->inputs[n_inputs] = src;
                     }
                     node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
                 }
             }
         }
-        sched->splits[cur_split].i_end = graph->n_nodes;
-        sched->n_splits = cur_split + 1;
+        split->i_end = graph->n_nodes;
+        sched->n_splits = i_split + 1;
     }
 #ifdef DEBUG_PASS4
     fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

-#ifndef NDEBUG
-    // sanity check: all sources should have the same backend as the node
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        if (tensor_backend == NULL) {
-            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
-        }
-        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
-            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
-                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            if (src_backend != tensor_backend /* && src_backend != NULL */) {
-                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
-            }
-            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
-                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
-                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
-            }
-        }
-    }
-    fflush(stderr);
-#endif
-
     // create copies of the graph for each split
     // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
+            assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
+            const size_t input_id = hash_id(input);
+            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];

             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
             input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

             // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }

         for (int j = split->i_start; j < split->i_end; j++) {
+            assert(graph_copy->size > graph_copy->n_nodes);
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
@@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 }
                 ggml_backend_tensor_copy(input, input_cpy);
             } else {
+                // wait for the split backend to finish using the input before overwriting it
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                 } else {
                     ggml_backend_synchronize(split_backend);
-                    ggml_backend_synchronize(input_backend);
                 }
-
                 ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
             }
         }
@@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
+
+    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);

     sched->n_backends = n_backends;

     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

-    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+    const int initial_splits_capacity = 16;
+    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits_capacity = initial_splits_capacity;

     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
@@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    free(sched->splits);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
@@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 }

 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
     ggml_backend_sched_split_graph(sched, measure_graph);

     // TODO: extract this to a separate function
@@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }

 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);

     ggml_backend_sched_split_graph(sched, graph);

@@ -70,11 +70,11 @@ extern "C" {
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API enum ggml_status ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-    GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
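Note on the header changes above: ggml_backend_graph_compute_async now returns enum ggml_status (matching the synchronous call), and ggml_backend_offload_op is new. An illustrative caller sketch, assuming a valid backend, cgraph and op; none of this code is part of the diff:

    enum ggml_status status = ggml_backend_graph_compute_async(backend, cgraph);
    if (status != GGML_STATUS_SUCCESS) {
        // handle the failure, e.g. fall back to ggml_backend_graph_compute or abort
    }

    if (ggml_backend_supports_op(backend, op) && ggml_backend_offload_op(backend, op)) {
        // this backend supports the op and asks for it to be offloaded to it,
        // even though the op's weights live in a lower-priority backend's buffer
    }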