llama_cpp 0.14.2 → 0.14.4

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
     return err;
 }

-bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }

@@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }

+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    if (backend->iface.offload_op != NULL) {
+        return backend->iface.offload_op(backend, op);
+    }
+    return false;
+}
+
 // backend copy

 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
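
Illustrative note (not part of the published diff): the new ggml_backend_offload_op wrapper above only forwards to backends that implement the optional offload_op callback and returns false otherwise. A minimal sketch of a hypothetical backend opting in; the function name and the batch threshold are made up for illustration and assume the usual ggml headers:

// sketch only: a hypothetical backend claiming large matrix multiplications so that the
// scheduler may run them on this backend even though the weights live elsewhere
static bool example_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    (void) backend; // the decision here depends only on the op
    // claim only matrix multiplications with a batch large enough to amortize the weight transfer
    return op->op == GGML_OP_MUL_MAT && op->ne[1] >= 32;
}
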
@@ -413,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);

     // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
     extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
@@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg

     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
     }

     cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,
@@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif

 #ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 256
+#define GGML_SCHED_MAX_SPLITS 2048
 #endif

 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif

 #ifndef GGML_SCHED_MAX_COPIES
@@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;

     // graph splits
-    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
+    struct ggml_backend_sched_split * splits;
     int n_splits;
+    int splits_capacity;

     // pipeline parallelism support
     int n_copies;
@@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op

     // assign pre-allocated nodes to their backend
-    // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
-    if (cur_backend != -1) {
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
-        return cur_backend;
+        return cur_backend_id;
     }

     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
-        if (cur_backend != -1) {
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
-            return cur_backend;
+            return cur_backend_id;
         }
     }

-    // input
+    // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
-        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
         SET_CAUSE(tensor, "1.inp");
-        return cur_backend;
+        return cur_backend_id;
     }

     // assign nodes that use weights to the backend of the weights
+    // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
-            // operations with weights are always run on the same backend as the weights
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            // check if a backend with higher prio wants to offload the op
+            if (src_backend_id == sched->n_backends - 1) {
+                for (int b = 0; b < src_backend_id; b++) {
+                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                        SET_CAUSE(tensor, "1.off");
+                        return b;
+                    }
+                }
+            }
             SET_CAUSE(tensor, "1.wgt%d", i);
-            return src_backend;
+            return src_backend_id;
         }
     }

@@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-        if (tensor_backend_id(leaf) != -1) {
+        int * leaf_backend_id = &tensor_backend_id(leaf);
+        if (*leaf_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }

     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        if (tensor_backend_id(node) != -1) {
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
+        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
         // src
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            if (tensor_backend_id(src) == -1) {
-                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
+                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
             }
         }
     }
@@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.2");
             }
         }
     }
-
     // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
@@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.1");
             }
         }
     }
-
-
     // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
@@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.4");
             }
         }
     }
-    // pass 2.3 expand rest up
+    // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.3");
             }
         }
@@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        int cur_backend_id = tensor_backend_id(node);
-        if (node->view_src != NULL && cur_backend_id == -1) {
-            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
+        int * cur_backend_id = &tensor_backend_id(node);
+        if (node->view_src != NULL && *cur_backend_id == -1) {
+            *cur_backend_id = tensor_backend_id(node->view_src);
            SET_CAUSE(node, "3.vsrc");
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (src == NULL) {
                continue;
            }
-            int src_backend_id = tensor_backend_id(src);
-            if (src_backend_id == -1) {
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
                if (src->view_src != NULL) {
                    // views are always on the same backend as the source
-                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
+                    *src_backend_id = tensor_backend_id(src->view_src);
                    SET_CAUSE(src, "3.vsrc");
                } else {
-                    tensor_backend_id(src) = cur_backend_id;
+                    *src_backend_id = *cur_backend_id;
                    SET_CAUSE(src, "3.cur");
                }
            }
@@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

     // pass 4: split graph, find tensors that need to be copied
     {
-        int cur_split = 0;
+        int i_split = 0;
+        struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
-                sched->splits[0].backend_id = tensor_backend_id(node);
+                split->backend_id = tensor_backend_id(node);
                 break;
             }
         }
-        sched->splits[0].i_start = 0;
-        sched->splits[0].n_inputs = 0;
-        memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-        int cur_backend_id = sched->splits[0].backend_id;
+        split->i_start = 0;
+        split->n_inputs = 0;
+        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+        int cur_backend_id = split->backend_id;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];

@@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                continue;
            }

-            int tensor_backend_id = tensor_backend_id(node);
+            const int node_backend_id = tensor_backend_id(node);

-            GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
+            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now

-            if (tensor_backend_id != cur_backend_id) {
-                sched->splits[cur_split].i_end = i;
-                cur_split++;
-                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
-                sched->splits[cur_split].backend_id = tensor_backend_id;
-                sched->splits[cur_split].i_start = i;
-                sched->splits[cur_split].n_inputs = 0;
-                cur_backend_id = tensor_backend_id;
+            // check if we should start a new split based on the sources of the current node
+            bool need_new_split = false;
+            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * src = node->src[j];
+                    if (src == NULL) {
+                        continue;
+                    }
+                    // check if a weight is on a different backend
+                    // by starting a new split, the memory of the previously offloaded weights can be reused
+                    if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                        int src_backend_id = tensor_backend_id(src);
+                        if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                    // check if the split has too many inputs
+                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                        const size_t id = hash_id(src);
+                        int src_backend_id = sched->tensor_backend_id[id];
+                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (node_backend_id != cur_backend_id || need_new_split) {
+                split->i_end = i;
+                i_split++;
+                if (i_split >= sched->splits_capacity) {
+                    sched->splits_capacity *= 2;
+                    sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                    GGML_ASSERT(sched->splits != NULL);
+                }
+                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
+                split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
+                cur_backend_id = node_backend_id;
            }

            // find inputs that are not on the same backend
@@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                    continue;
                }

-                int src_backend_id = tensor_backend_id(src);
+                const int src_backend_id = tensor_backend_id(src);
                assert(src_backend_id != -1); // all inputs should be assigned by now

-                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                    size_t id = hash_id(src);
                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                        ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                        }
                        sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = src_backend_id;
                        SET_CAUSE(tensor_copy, "4.cpy");
                    }
                    int n_graph_inputs = sched->n_graph_inputs++;
@@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                    }
                }

-                if (src_backend_id != tensor_backend_id) {
+                if (src_backend_id != node_backend_id) {
                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
+                    const size_t id = hash_id(src);
                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                        ggml_backend_t backend = sched->backends[cur_backend_id];
                        for (int c = 0; c < sched->n_copies; c++) {
@@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                        }
                        sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
                        SET_CAUSE(tensor_copy, "4.cpy");
                    }
-                    int n_inputs = sched->splits[cur_split].n_inputs++;
+                    int n_inputs = split->n_inputs++;
                    GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
-                    sched->splits[cur_split].inputs[n_inputs] = src;
+                    split->inputs[n_inputs] = src;
                }
                node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
            }
        }
    }
-        sched->splits[cur_split].i_end = graph->n_nodes;
-        sched->n_splits = cur_split + 1;
+        split->i_end = graph->n_nodes;
+        sched->n_splits = i_split + 1;
    }
#ifdef DEBUG_PASS4
    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif

-#ifndef NDEBUG
-    // sanity check: all sources should have the same backend as the node
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        if (tensor_backend == NULL) {
-            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
-        }
-        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
-            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
-                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            if (src_backend != tensor_backend /* && src_backend != NULL */) {
-                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
-            }
-            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
-                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
-                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
-            }
-        }
-    }
-    fflush(stderr);
-#endif
-
    // create copies of the graph for each split
    // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &sched->splits[i];
        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
+            assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
            struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
+            const size_t input_id = hash_id(input);
+            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];

            // add a dependency to the input source so that it is not freed before the copy is done
            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
            input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

            // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        }

        for (int j = split->i_start; j < split->i_end; j++) {
+            assert(graph_copy->size > graph_copy->n_nodes);
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
        }
@@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                }
                ggml_backend_tensor_copy(input, input_cpy);
            } else {
+                // wait for the split backend to finish using the input before overwriting it
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                } else {
                    ggml_backend_synchronize(split_backend);
-                    ggml_backend_synchronize(input_backend);
                }
-
                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
            }
        }
@@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

    // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    sched->hash_set = ggml_hash_set_new(graph_size);
    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
+
+    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);

    sched->n_backends = n_backends;

    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

-    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+    const int initial_splits_capacity = 16;
+    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits_capacity = initial_splits_capacity;

    for (int b = 0; b < n_backends; b++) {
        sched->backends[b] = backends[b];
@@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
    }
    ggml_gallocr_free(sched->galloc);
    ggml_free(sched->ctx);
+    free(sched->splits);
    free(sched->hash_set.keys);
    free(sched->tensor_backend_id);
    free(sched->tensor_copies);
@@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
}

bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
    ggml_backend_sched_split_graph(sched, measure_graph);

    // TODO: extract this to a separate function
@@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
}

bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);

    ggml_backend_sched_split_graph(sched, graph);

@@ -70,11 +70,11 @@ extern "C" {
    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-    GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
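
Illustrative note (not part of the published diff): with ggml_backend_graph_compute_async now returning enum ggml_status, callers can handle errors the same way as for the blocking variant. A minimal usage sketch, assuming only the public API declared above; the wrapper function name is made up for illustration:

// sketch only: launch the graph asynchronously, then wait before reading the results
static enum ggml_status compute_graph_and_wait(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    enum ggml_status status = ggml_backend_graph_compute_async(backend, cgraph);
    if (status != GGML_STATUS_SUCCESS) {
        return status; // e.g. GGML_STATUS_FAILED or GGML_STATUS_ABORTED
    }
    ggml_backend_synchronize(backend); // block until the asynchronous computation has finished
    return GGML_STATUS_SUCCESS;
}
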
@@ -377,6 +377,27 @@ typedef struct {
} block_iq1_s;
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");

+// 1.75 bpw
+typedef struct {
+    uint8_t qs[QK_K/8];      // grid index, low 8 bits
+    uint8_t qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
+#if QK_K == 64
+    ggml_half d;
+#endif
+    uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+#if QK_K == 64
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+#else
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+#endif
+
+// Used by IQ1_M quants
+typedef union {
+    ggml_half f16;
+    uint16_t u16;
+} iq1m_scale_t;
+
// Non-linear quants
#define QK4_NL 32
typedef struct {
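
Illustrative note (not part of the published diff): the "1.75 bpw" figure for block_iq1_m follows from the struct layout at the default QK_K = 256. qs takes QK_K/8 = 32 bytes, qh takes QK_K/16 = 16 bytes, and scales takes QK_K/32 = 8 bytes, so one block of 256 weights occupies 56 bytes = 448 bits, i.e. 448/256 = 1.75 bits per weight; the per-block ggml_half d is only present when QK_K == 64.
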
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_SYCL)
+
#include <cstdint>

-#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
-#define GGML_TABLE_END() });
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#endif
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()

#define NGRID_IQ1S 2048
#define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
#if defined(GGML_COMMON_IMPL_C)
GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
    0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,