llama_cpp 0.12.2 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +68 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -2
- data/vendor/tmp/llama.cpp/Makefile +25 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +87 -27
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +176 -18
- data/vendor/tmp/llama.cpp/ggml-backend.h +14 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +144 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +736 -59
- data/vendor/tmp/llama.cpp/ggml-quants.h +20 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +664 -117
- data/vendor/tmp/llama.cpp/ggml.h +46 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1426 -341
- data/vendor/tmp/llama.cpp/llama.h +24 -15
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +10 -3
data/vendor/tmp/llama.cpp/ggml-backend.c

@@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }
 
+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+    // get_max_size is optional, defaults to SIZE_MAX
+    if (buft->iface.get_max_size) {
+        return buft->iface.get_max_size(buft);
+    }
+    return SIZE_MAX;
+}
+
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
     }
     return ggml_nbytes(tensor);
 }
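The new get_max_size hook lets a buffer type report the largest buffer it can allocate in one piece (backends such as Metal and Vulkan cap single allocations), falling back to SIZE_MAX when no limit exists. A minimal sketch of how a caller might use it to size a chunked allocation; n_buffers_needed is a hypothetical helper, not part of this diff:

    #include <stddef.h>
    #include "ggml-backend.h"

    // hypothetical helper: how many buffers a total_size allocation needs
    // when the buffer type caps the size of a single buffer
    static size_t n_buffers_needed(ggml_backend_buffer_type_t buft, size_t total_size) {
        size_t max_size = ggml_backend_buft_get_max_size(buft); // SIZE_MAX if uncapped
        return total_size == 0 ? 0 : (total_size - 1) / max_size + 1; // overflow-safe ceil
    }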
@@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
         size_t size) {
     ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
 
-    GGML_ASSERT(iface.get_base != NULL);
-
     (*buffer) = (struct ggml_backend_buffer) {
         /* .interface = */ iface,
         /* .buft      = */ buft,
@@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
 
+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
@@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
 
 void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
     buffer->usage = usage;
+
+    // FIXME: add a generic callback to the buffer interface
+    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+        ggml_backend_multi_buffer_set_usage(buffer, usage);
+    }
 }
 
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
     return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
 }
 
+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
 void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@@ -337,11 +358,26 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_cuda_reg_devices();
 #endif
 
+#ifdef GGML_USE_SYCL
+    extern void ggml_backend_sycl_reg_devices(void);
+    ggml_backend_sycl_reg_devices();
+#endif
+
 #ifdef GGML_USE_METAL
     extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
     extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
+
+#ifdef GGML_USE_VULKAN
+    extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+    ggml_backend_vk_reg_devices();
+#endif
+
+#ifdef GGML_USE_KOMPUTE
+    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
+    ggml_backend_kompute_reg_devices();
+#endif
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
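With SYCL, Vulkan, and Kompute now registered alongside CUDA and Metal, a build can carry several compute backends at once. A sketch that lists whatever ended up in the registry, assuming the pre-existing ggml_backend_reg_get_count/ggml_backend_reg_get_name accessors from ggml-backend.h:

    #include <stdio.h>
    #include "ggml-backend.h"

    // print every backend registered by ggml_backend_registry_init(),
    // which runs lazily on first use of the registry API
    static void print_registered_backends(void) {
        for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
            printf("backend %zu: %s\n", i, ggml_backend_reg_get_name(i));
        }
    }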
@@ -545,6 +581,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -692,6 +730,8 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
+        case GGML_OP_CPY:
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -754,6 +794,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
     GGML_UNUSED(user_data);
 }
 
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_free(ctx->buffers[i]);
+    }
+
+    free(ctx->buffers);
+    free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_clear(ctx->buffers[i], value);
+    }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+    static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+        /* .get_name        = */ ggml_backend_multi_buffer_get_name,
+        /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
+        /* .get_base        = */ NULL,
+        /* .init_tensor     = */ NULL,
+        /* .set_tensor      = */ NULL,
+        /* .get_tensor      = */ NULL,
+        /* .cpy_tensor      = */ NULL,
+        /* .clear           = */ ggml_backend_multi_buffer_clear,
+        /* .reset           = */ NULL,
+    };
+
+    return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+    }
+}
+
 
 // scheduler
 
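ggml_backend_multi_buffer_alloc_buffer wraps several already-allocated buffers behind the ordinary single-buffer interface: freeing the wrapper frees all children, and the set_usage forwarding added above fans out to each of them. A minimal sketch, assuming a and b come from the same buffer type and that the declarations land in ggml-backend-impl.h (which grows by six lines in this diff):

    #include "ggml-backend-impl.h"

    // combine two buffers into one handle that is freed (and retagged) as a unit
    static ggml_backend_buffer_t make_multi(ggml_backend_buffer_t a, ggml_backend_buffer_t b) {
        ggml_backend_buffer_t bufs[2] = { a, b };
        return ggml_backend_multi_buffer_alloc_buffer(bufs, 2);
    }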
@@ -802,6 +916,9 @@ struct ggml_backend_sched {
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
     char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+
+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
 };
 
 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
@@ -1186,6 +1303,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                 ggml_tallocr_t src_allocr = node_allocr(src);
                 GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
                 if (src_allocr != node_allocr) {
+                    // create a copy of the input in the split's backend
+                    size_t id = hash_id(src);
+                    if (sched->node_copies[id][cur_backend_id] == NULL) {
+                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                        sched->node_copies[id][cur_backend_id] = tensor_copy;
+                        node_allocr(tensor_copy) = cur_allocr;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+
+                        int n_inputs = sched->splits[cur_split].n_inputs++;
+                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                        sched->splits[cur_split].inputs[n_inputs] = src;
+                    }
+                    node->src[j] = sched->node_copies[id][cur_backend_id];
+
+#if 0
                     // check if the input is already in the split
                     bool found = false;
                     for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1201,19 +1336,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                         GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                         sched->splits[cur_split].inputs[n_inputs] = src;
                     }
-
-                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
-                    if (sched->node_copies[id][cur_backend_id] == NULL) {
-                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->node_copies[id][cur_backend_id] = tensor_copy;
-                        node_allocr(tensor_copy) = cur_allocr;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-                    }
-                    node->src[j] = sched->node_copies[id][cur_backend_id];
+#endif
                 }
             }
         }
@@ -1324,9 +1447,38 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_graph_dump_dot(split->graph, NULL, split_filename);
 #endif
 
+
         uint64_t compute_start_us = ggml_time_us();
-        ggml_backend_graph_compute(split_backend, &split->graph);
-        //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        if (!sched->callback_eval) {
+            ggml_backend_graph_compute(split_backend, &split->graph);
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        } else {
+            // similar to ggml_backend_compare_graph_backend
+            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+                struct ggml_tensor * t = split->graph.nodes[j0];
+
+                // check if the user needs data from this node
+                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+                int j1 = j0;
+
+                // determine the range [j0, j1] of nodes that can be computed together
+                while (!need && j1 < split->graph.n_nodes - 1) {
+                    t = split->graph.nodes[++j1];
+                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+                }
+
+                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+                ggml_backend_graph_compute(split_backend, &gv);
+
+                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                    break;
+                }
+
+                j0 = j1;
+            }
+        }
         uint64_t compute_end_us = ggml_time_us();
         compute_us[split_backend_id] += compute_end_us - compute_start_us;
     }
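The loop above batches consecutive nodes the callback declines to observe into a single ggml_backend_graph_compute call, and aborts the remaining work if the callback returns false after a node is delivered. A hypothetical callback that only inspects mat-mul results, matching the ggml_backend_sched_eval_callback contract introduced below in ggml-backend.h:

    #include <stdbool.h>
    #include "ggml.h"

    // ask == true: choose which nodes break the batch; ask == false: the node
    // has been computed and can be inspected via t->data
    static bool observe_mul_mat(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            return t->op == GGML_OP_MUL_MAT;
        }
        // inspect or log t here; returning false cancels the rest of the graph
        return true;
    }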
@@ -1431,6 +1583,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }
 
+
+void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+    sched->callback_eval = callback;
+    sched->callback_eval_user_data = user_data;
+}
+
 int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
data/vendor/tmp/llama.cpp/ggml-backend.h

@@ -20,6 +20,7 @@ extern "C" {
     GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
@@ -36,6 +37,7 @@ extern "C" {
     GGML_API           size_t                ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
     GGML_API GGML_CALL void                  ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API           size_t                ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
     GGML_API           size_t                ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API           void                  ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API           bool                  ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
@@ -54,6 +56,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
     GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
     GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
 
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
@@ -148,6 +151,14 @@ extern "C" {
     struct ggml_backend_sched;
     typedef struct ggml_backend_sched * ggml_backend_sched_t;
 
+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
     // Initialize a backend scheduler
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
@@ -168,6 +179,9 @@ extern "C" {
     // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
     //
     // Utils
     //
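Wiring the callback into a scheduler is a single call; a hypothetical pairing with the observe_mul_mat sketch above, assuming sched came from ggml_backend_sched_new and graph is a built ggml_cgraph:

    ggml_backend_sched_set_eval_callback(sched, observe_mul_mat, NULL);
    ggml_backend_sched_graph_compute(sched, graph);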