llama_cpp 0.14.2 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-backend.c:

```diff
@@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
     return err;
 }
 
-bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }
 
@@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }
 
+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    if (backend->iface.offload_op != NULL) {
+        return backend->iface.offload_op(backend, op);
+    }
+    return false;
+}
+
 // backend copy
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
```
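The new `ggml_backend_offload_op` entry point above falls back to `false` when a backend leaves the `offload_op` slot of its interface unset (the CPU backend does exactly that further down). The following standalone sketch mirrors that optional-hook pattern; the `fake_*` types and the size-based policy are illustrative stand-ins, not the real ggml structures.

```c
// Optional-hook dispatch, mirroring ggml_backend_offload_op: a backend that does
// not implement the hook simply declines to offload. All types here are stand-ins.
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct fake_tensor { const char * name; long long n_elements; };

struct fake_backend_iface {
    // optional: NULL means "never offload to me"
    bool (*offload_op)(const struct fake_tensor * op);
};

struct fake_backend { const char * name; struct fake_backend_iface iface; };

static bool offload_op(const struct fake_backend * backend, const struct fake_tensor * op) {
    if (backend->iface.offload_op != NULL) {
        return backend->iface.offload_op(op);
    }
    return false; // same fallback as a backend that sets .offload_op to NULL
}

// example policy: a GPU-like backend only wants "large enough" ops
static bool gpu_offload_policy(const struct fake_tensor * op) {
    return op->n_elements >= 32 * 512;
}

int main(void) {
    struct fake_backend gpu = { "gpu", { gpu_offload_policy } };
    struct fake_backend cpu = { "cpu", { NULL } };
    struct fake_tensor big   = { "mul_mat_big",   1 << 20 };
    struct fake_tensor small = { "mul_mat_small", 256 };

    printf("gpu offloads %s: %d\n", big.name,   offload_op(&gpu, &big));
    printf("gpu offloads %s: %d\n", small.name, offload_op(&gpu, &small));
    printf("cpu offloads %s: %d\n", big.name,   offload_op(&cpu, &big));
    return 0;
}
```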
```diff
@@ -413,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
 
     // add forward decls here to avoid including the backend headers
-#ifdef
+#ifdef GGML_USE_CUDA
     extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
@@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
     }
 
     cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
```
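The added check makes the CPU graph-plan constructor release the partially built plan and return `NULL` when the work-buffer allocation fails, instead of handing back a plan with a `NULL` work buffer. A minimal standalone sketch of that allocate-check-clean-up pattern, using illustrative stand-in types rather than the real ggml_backend_cpu structs:

```c
// Sketch of the failure path added above: if the secondary allocation fails,
// free what was already allocated and report failure to the caller.
#include <stdlib.h>

struct fake_plan {
    size_t work_size;
    void * work_data;
};

static struct fake_plan * plan_create(size_t work_size) {
    struct fake_plan * plan = malloc(sizeof(*plan));
    if (plan == NULL) {
        return NULL;
    }
    plan->work_size = work_size;
    plan->work_data = NULL;
    if (work_size > 0) {
        plan->work_data = malloc(work_size);
        if (plan->work_data == NULL) {
            free(plan);    // same cleanup as the diff: drop the plan, return NULL
            return NULL;
        }
    }
    return plan;
}

static void plan_free(struct fake_plan * plan) {
    if (plan != NULL) {
        free(plan->work_data);
        free(plan);
    }
}

int main(void) {
    struct fake_plan * plan = plan_create(1024);
    plan_free(plan);
    return 0;
}
```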
```diff
@@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,
@@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS
+#define GGML_SCHED_MAX_SPLITS 2048
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES
@@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;
 
     // graph splits
-    struct ggml_backend_sched_split splits
+    struct ggml_backend_sched_split * splits;
     int n_splits;
+    int splits_capacity;
 
     // pipeline parallelism support
     int n_copies;
@@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op
 
     // assign pre-allocated nodes to their backend
-
-
-    if (cur_backend != -1) {
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
-        return
+        return cur_backend_id;
     }
 
     // view_src
     if (tensor->view_src != NULL) {
-
-        if (
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        if (cur_backend_id != -1) {
            SET_CAUSE(tensor, "1.vsrc");
-            return
+            return cur_backend_id;
        }
     }
 
-    // input
+    // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
-
+        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
         SET_CAUSE(tensor, "1.inp");
-        return
+        return cur_backend_id;
     }
 
     // assign nodes that use weights to the backend of the weights
+    // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int
-            //
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            // check if a backend with higher prio wants to offload the op
+            if (src_backend_id == sched->n_backends - 1) {
+                for (int b = 0; b < src_backend_id; b++) {
+                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                        SET_CAUSE(tensor, "1.off");
+                        return b;
+                    }
+                }
+            }
             SET_CAUSE(tensor, "1.wgt%d", i);
-            return
+            return src_backend_id;
         }
     }
 
```
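Taken together, `ggml_backend_sched_backend_id_from_cur` now picks a backend in this order: the buffer that already holds the tensor, then the view source, then the last (CPU) backend for graph inputs, and finally the backend of the weights, with the new offload hook letting a higher-priority backend claim ops whose weights sit on the CPU. A standalone sketch of that decision order, with plain ints standing in for tensors and backends (all names here are illustrative):

```c
// Selection order of the scheduler's pass 1, reduced to plain data.
#include <stdbool.h>
#include <stdio.h>

#define N_BACKENDS 3          // 0 = highest-priority GPU, N_BACKENDS-1 = CPU
#define UNASSIGNED (-1)

struct fake_op {
    int  buffer_backend;      // backend owning the pre-allocated output, or UNASSIGNED
    int  view_src_backend;    // backend of the view source, or UNASSIGNED
    bool is_graph_input;
    int  weight_backend;      // backend holding the op's weight, or UNASSIGNED
    bool offloadable;         // would a GPU backend accept this op via the offload hook?
};

static int pick_backend(const struct fake_op * op) {
    if (op->buffer_backend != UNASSIGNED)   { return op->buffer_backend;   } // "1.dst"
    if (op->view_src_backend != UNASSIGNED) { return op->view_src_backend; } // "1.vsrc"
    if (op->is_graph_input)                 { return N_BACKENDS - 1;       } // "1.inp"
    if (op->weight_backend != UNASSIGNED) {
        // weight lives on the CPU backend: give higher-priority backends a chance to offload
        if (op->weight_backend == N_BACKENDS - 1 && op->offloadable) {
            return 0;                                                        // "1.off"
        }
        return op->weight_backend;                                           // "1.wgt"
    }
    return UNASSIGNED; // left for the later expansion passes
}

int main(void) {
    struct fake_op cpu_weight_matmul = { UNASSIGNED, UNASSIGNED, false, N_BACKENDS - 1, true  };
    struct fake_op gpu_weight_matmul = { UNASSIGNED, UNASSIGNED, false, 0,              false };
    printf("cpu-weight op -> backend %d\n", pick_backend(&cpu_weight_matmul)); // 0 (offloaded)
    printf("gpu-weight op -> backend %d\n", pick_backend(&gpu_weight_matmul)); // 0 (follow weight)
    return 0;
}
```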
```diff
@@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-
+        int * leaf_backend_id = &tensor_backend_id(leaf);
+        if (*leaf_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-
+        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }
 
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-
+        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
         // src
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-
-
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
+                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
             }
         }
     }
@@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int
-            if (
-            if (
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id =
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.2");
             }
         }
     }
-
     // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
@@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int
-            if (
-            if (
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id =
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.1");
             }
         }
     }
-
-
     // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
@@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int
-            if (
-                cur_backend_id =
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.4");
             }
         }
     }
-
+    // pass 2.3 expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int
-            if (
-                cur_backend_id =
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.3");
             }
         }
```
```diff
@@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        int cur_backend_id = tensor_backend_id(node);
-        if (node->view_src != NULL && cur_backend_id == -1) {
-            cur_backend_id = tensor_backend_id(node
+        int * cur_backend_id = &tensor_backend_id(node);
+        if (node->view_src != NULL && *cur_backend_id == -1) {
+            *cur_backend_id = tensor_backend_id(node->view_src);
             SET_CAUSE(node, "3.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
-            int src_backend_id = tensor_backend_id(src);
-            if (src_backend_id == -1) {
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
-
+                    *src_backend_id = tensor_backend_id(src->view_src);
                     SET_CAUSE(src, "3.vsrc");
                 } else {
-
+                    *src_backend_id = *cur_backend_id;
                     SET_CAUSE(src, "3.cur");
                 }
             }
@@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
     // pass 4: split graph, find tensors that need to be copied
     {
-        int
+        int i_split = 0;
+        struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
-
+                split->backend_id = tensor_backend_id(node);
                 break;
             }
         }
-
-
-        memset(
-        int cur_backend_id =
+        split->i_start = 0;
+        split->n_inputs = 0;
+        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+        int cur_backend_id = split->backend_id;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
 
@@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 continue;
             }
 
-            int
+            const int node_backend_id = tensor_backend_id(node);
 
-            GGML_ASSERT(
+            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
 
-            if
-
-
-
-
-
-
-
+            // check if we should start a new split based on the sources of the current node
+            bool need_new_split = false;
+            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * src = node->src[j];
+                    if (src == NULL) {
+                        continue;
+                    }
+                    // check if a weight is on a different backend
+                    // by starting a new split, the memory of the previously offloaded weights can be reused
+                    if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                        int src_backend_id = tensor_backend_id(src);
+                        if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                    // check if the split has too many inputs
+                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                        const size_t id = hash_id(src);
+                        int src_backend_id = sched->tensor_backend_id[id];
+                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (node_backend_id != cur_backend_id || need_new_split) {
+                split->i_end = i;
+                i_split++;
+                if (i_split >= sched->splits_capacity) {
+                    sched->splits_capacity *= 2;
+                    sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                    GGML_ASSERT(sched->splits != NULL);
+                }
+                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
+                split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
+                cur_backend_id = node_backend_id;
             }
 
             // find inputs that are not on the same backend
```
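Pass 4 now grows `sched->splits` on demand: the array starts at a small capacity (16 entries, set in `ggml_backend_sched_new` further down) and doubles via `realloc` whenever another split is needed, with `GGML_SCHED_MAX_SPLITS` (2048) kept only as an assertion. A standalone sketch of that growth strategy, using a stand-in split struct:

```c
// Capacity-doubling array, mirroring splits/splits_capacity in the diff.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_SPLITS 2048

struct fake_split { int backend_id; int i_start; int i_end; };

struct split_list {
    struct fake_split * data;
    int count;
    int capacity;
};

static void split_list_init(struct split_list * l) {
    l->capacity = 16;                                   // initial_splits_capacity in the diff
    l->data = calloc(l->capacity, sizeof(l->data[0]));
    l->count = 0;
    assert(l->data != NULL);
}

static struct fake_split * split_list_push(struct split_list * l) {
    if (l->count >= l->capacity) {
        l->capacity *= 2;                               // same doubling as sched->splits_capacity
        l->data = realloc(l->data, l->capacity * sizeof(l->data[0]));
        assert(l->data != NULL);
    }
    assert(l->count < MAX_SPLITS);
    return &l->data[l->count++];
}

int main(void) {
    struct split_list splits;
    split_list_init(&splits);
    for (int i = 0; i < 100; i++) {                     // 100 splits forces several doublings
        struct fake_split * s = split_list_push(&splits);
        s->backend_id = i % 2;
        s->i_start = i;
        s->i_end = i + 1;
    }
    printf("splits: %d, capacity: %d\n", splits.count, splits.capacity);
    free(splits.data);
    return 0;
}
```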
```diff
@@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                    continue;
                }
 
-                int src_backend_id = tensor_backend_id(src);
+                const int src_backend_id = tensor_backend_id(src);
                assert(src_backend_id != -1); // all inputs should be assigned by now
 
-                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                    size_t id = hash_id(src);
                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                        ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                        }
                        sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = src_backend_id;
                        SET_CAUSE(tensor_copy, "4.cpy");
                    }
                    int n_graph_inputs = sched->n_graph_inputs++;
@@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                    }
                }
 
-                if (src_backend_id !=
+                if (src_backend_id != node_backend_id) {
                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
+                    const size_t id = hash_id(src);
                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                        ggml_backend_t backend = sched->backends[cur_backend_id];
                        for (int c = 0; c < sched->n_copies; c++) {
@@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                        }
                        sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
                        SET_CAUSE(tensor_copy, "4.cpy");
                    }
-                    int n_inputs =
+                    int n_inputs = split->n_inputs++;
                    GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
-
+                    split->inputs[n_inputs] = src;
                }
                node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
            }
        }
    }
-
-    sched->n_splits =
+    split->i_end = graph->n_nodes;
+    sched->n_splits = i_split + 1;
 }
 #ifdef DEBUG_PASS4
    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
 
-#ifndef NDEBUG
-    // sanity check: all sources should have the same backend as the node
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        if (tensor_backend == NULL) {
-            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
-        }
-        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
-            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
-                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            if (src_backend != tensor_backend /* && src_backend != NULL */) {
-                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
-            }
-            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
-                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
-                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
-            }
-        }
-    }
-    fflush(stderr);
-#endif
-
    // create copies of the graph for each split
    // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &sched->splits[i];
        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
+            assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
            struct ggml_tensor * input = split->inputs[j];
-
+            const size_t input_id = hash_id(input);
+            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
 
            // add a dependency to the input source so that it is not freed before the copy is done
            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
            input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
 
            // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        }
 
        for (int j = split->i_start; j < split->i_end; j++) {
+            assert(graph_copy->size > graph_copy->n_nodes);
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
        }
@@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                }
                ggml_backend_tensor_copy(input, input_cpy);
            } else {
+                // wait for the split backend to finish using the input before overwriting it
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                } else {
                    ggml_backend_synchronize(split_backend);
-                    ggml_backend_synchronize(input_backend);
                }
-
                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
            }
        }
@@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
 
    // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size
+    sched->hash_set = ggml_hash_set_new(graph_size);
    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-
-
+
+    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
 
    sched->n_backends = n_backends;
 
    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
-
+    const int initial_splits_capacity = 16;
+    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits_capacity = initial_splits_capacity;
 
    for (int b = 0; b < n_backends; b++) {
        sched->backends[b] = backends[b];
```
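The sizing in `ggml_backend_sched_new` now separates the hash set (sized by `graph_size` alone) from the per-node backend-id arrays, which appear to be sized for the worst case of every split adding two extra nodes per input (the input dependency and its copy, matching the `*2` in the graph-copy allocation above). A worked example of that arithmetic; `GGML_MAX_SRC = 10` is an assumed value for the vendored ggml.h of this release, while the other constants come from the hunks above:

```c
// Worked sizing for the scheduler's tables (illustrative values only).
#include <stdio.h>

#define GGML_MAX_SRC                 10     // assumption, see lead-in
#define GGML_SCHED_MAX_SPLITS        2048
#define GGML_SCHED_MAX_SPLIT_INPUTS  GGML_MAX_SRC

int main(void) {
    size_t graph_size = 2048;   // example value passed to ggml_backend_sched_new
    size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS * GGML_SCHED_MAX_SPLIT_INPUTS * 2;
    printf("hash set entries : %zu\n", graph_size);   // sched->hash_set = ggml_hash_set_new(graph_size)
    printf("node id entries  : %zu\n", nodes_size);   // 2048 + 2048*10*2 = 43008
    return 0;
}
```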
```diff
@@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    free(sched->splits);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
@@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 }
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
     ggml_backend_sched_split_graph(sched, measure_graph);
 
     // TODO: extract this to a separate function
@@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
 
     ggml_backend_sched_split_graph(sched, graph);
 
```
data/vendor/tmp/llama.cpp/ggml-backend.h:

```diff
@@ -70,11 +70,11 @@ extern "C" {
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API enum ggml_status ggml_backend_graph_compute
-
-    GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
```
data/vendor/tmp/llama.cpp/ggml-common.h:

```diff
@@ -377,6 +377,27 @@ typedef struct {
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 
+// 1.75 bpw
+typedef struct {
+    uint8_t qs[QK_K/8];      // grid index, low 8 bits
+    uint8_t qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
+#if QK_K == 64
+    ggml_half d;
+#endif
+    uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+#if QK_K == 64
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+#else
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+#endif
+
+// Used by IQ1_M quants
+typedef union {
+    ggml_half f16;
+    uint16_t  u16;
+} iq1m_scale_t;
+
 // Non-linear quants
 #define QK4_NL 32
 typedef struct {
```
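A quick check of the storage cost of the new `block_iq1_m` layout for the common `QK_K == 256` case, where the separate `ggml_half d` field is compiled out (see the `#if QK_K == 64` branch above): 32 + 16 + 8 bytes per 256 weights works out to the advertised 1.75 bits per weight.

```c
// Bits-per-weight arithmetic for block_iq1_m with QK_K == 256.
#include <stddef.h>
#include <stdio.h>

#define QK_K 256

int main(void) {
    size_t qs     = QK_K / 8;   // 32 bytes: grid index, low 8 bits
    size_t qh     = QK_K / 16;  // 16 bytes: grid index, high bits + shift bits
    size_t scales = QK_K / 32;  //  8 bytes: 3-bit block scales
    size_t block  = qs + qh + scales;
    printf("block_iq1_m: %zu bytes per %d weights = %.2f bits/weight\n",
           block, QK_K, 8.0 * (double) block / QK_K);
    return 0;
}
```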
```diff
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 
 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_SYCL)
+
 #include <cstdint>
 
-#define GGML_TABLE_BEGIN(type, name, size) static
-#define GGML_TABLE_END() }
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
 
 #define GGML_COMMON_IMPL
 #endif
```
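With this change the SYCL build declares its constant tables the same way as the other builds: `GGML_TABLE_BEGIN`/`GGML_TABLE_END` now expand to a plain `static const` array definition. A tiny self-contained illustration of the expansion (the `demo_grid` table is invented for the example):

```c
// What the macro pair expands to: a static const array literal.
#include <stdint.h>
#include <stdio.h>

#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

GGML_TABLE_BEGIN(uint8_t, demo_grid, 4)
    0x01, 0x02, 0x04, 0x08,
GGML_TABLE_END()
// expands to: static const uint8_t demo_grid[4] = { 0x01, 0x02, 0x04, 0x08, };

int main(void) {
    printf("demo_grid[2] = %u\n", (unsigned) demo_grid[2]);
    return 0;
}
```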
```diff
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()
 
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
 #if defined(GGML_COMMON_IMPL_C)
 GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
     0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
```
|