llama_cpp 0.12.3 → 0.12.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +22 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -2
- data/vendor/tmp/llama.cpp/Makefile +160 -56
- data/vendor/tmp/llama.cpp/ggml-alloc.c +85 -25
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +115 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +688 -270
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +121 -86
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +745 -109
- data/vendor/tmp/llama.cpp/ggml-quants.h +81 -56
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15296 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +51714 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5726 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +39 -0
- data/vendor/tmp/llama.cpp/ggml.c +356 -60
- data/vendor/tmp/llama.cpp/ggml.h +7 -1
- data/vendor/tmp/llama.cpp/llama.cpp +876 -118
- data/vendor/tmp/llama.cpp/llama.h +12 -16
- metadata +9 -2
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
|
|
335
335
|
}
|
336
336
|
|
337
337
|
size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
|
338
|
-
return alloc->max_size;
|
338
|
+
// FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
|
339
|
+
// to avoid this, we add a 10% margin to the buffer size
|
340
|
+
return alloc->max_size + alloc->max_size/10;
|
339
341
|
}
|
340
342
|
|
341
343
|
// graph allocator
|
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
|
|
776
778
|
}
|
777
779
|
|
778
780
|
// utils
|
779
|
-
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
780
|
-
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
|
781
|
-
|
782
|
-
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
783
|
-
|
784
|
-
size_t nbytes = 0;
|
785
|
-
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
786
|
-
if (t->data == NULL && t->view_src == NULL) {
|
787
|
-
nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
|
788
|
-
}
|
789
|
-
}
|
790
|
-
|
791
|
-
if (nbytes == 0) {
|
792
|
-
// all the tensors in the context are already allocated
|
793
|
-
#ifndef NDEBUG
|
794
|
-
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
|
795
|
-
#endif
|
796
|
-
return NULL;
|
797
|
-
}
|
798
781
|
|
799
|
-
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
|
782
|
+
static bool alloc_tensor_range(struct ggml_context * ctx,
|
783
|
+
struct ggml_tensor * first, struct ggml_tensor * last,
|
784
|
+
ggml_backend_buffer_type_t buft, size_t size,
|
785
|
+
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
|
786
|
+
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
800
787
|
if (buffer == NULL) {
|
801
|
-
// failed to allocate buffer
|
802
788
|
#ifndef NDEBUG
|
803
|
-
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
|
789
|
+
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
804
790
|
#endif
|
805
|
-
return NULL;
|
791
|
+
for (size_t i = 0; i < *n_buffers; i++) {
|
792
|
+
ggml_backend_buffer_free(*buffers[i]);
|
793
|
+
}
|
794
|
+
free(*buffers);
|
795
|
+
return false;
|
806
796
|
}
|
807
797
|
|
808
798
|
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
|
809
799
|
|
810
|
-
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
800
|
+
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
|
811
801
|
if (t->data == NULL) {
|
812
802
|
if (t->view_src == NULL) {
|
813
803
|
ggml_tallocr_alloc(tallocr, t);
|
@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
824
814
|
|
825
815
|
ggml_tallocr_free(tallocr);
|
826
816
|
|
817
|
+
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
|
818
|
+
(*buffers)[(*n_buffers)++] = buffer;
|
819
|
+
|
820
|
+
return true;
|
821
|
+
}
|
822
|
+
|
823
|
+
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
824
|
+
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
|
825
|
+
|
826
|
+
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
827
|
+
size_t max_size = ggml_backend_buft_get_max_size(buft);
|
828
|
+
|
829
|
+
ggml_backend_buffer_t * buffers = NULL;
|
830
|
+
size_t n_buffers = 0;
|
831
|
+
|
832
|
+
size_t cur_buf_size = 0;
|
833
|
+
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
|
834
|
+
for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
835
|
+
size_t this_size = 0;
|
836
|
+
if (t->data == NULL && t->view_src == NULL) {
|
837
|
+
this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
|
838
|
+
}
|
839
|
+
|
840
|
+
if (this_size > max_size) {
|
841
|
+
// tensor is too large to fit in a single buffer
|
842
|
+
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
|
843
|
+
__func__, t->name,
|
844
|
+
ggml_backend_buft_name(buft),
|
845
|
+
this_size, max_size);
|
846
|
+
for (size_t i = 0; i < n_buffers; i++) {
|
847
|
+
ggml_backend_buffer_free(buffers[i]);
|
848
|
+
}
|
849
|
+
free(buffers);
|
850
|
+
return NULL;
|
851
|
+
}
|
852
|
+
|
853
|
+
if ((cur_buf_size + this_size) > max_size) {
|
854
|
+
// allocate tensors in the current buffer
|
855
|
+
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
|
856
|
+
return NULL;
|
857
|
+
}
|
858
|
+
first = t;
|
859
|
+
cur_buf_size = this_size;
|
860
|
+
} else {
|
861
|
+
cur_buf_size += this_size;
|
862
|
+
}
|
863
|
+
}
|
864
|
+
|
865
|
+
// allocate remaining tensors
|
866
|
+
if (cur_buf_size > 0) {
|
867
|
+
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
|
868
|
+
return NULL;
|
869
|
+
}
|
870
|
+
}
|
871
|
+
|
872
|
+
if (n_buffers == 0) {
|
873
|
+
// all the tensors in the context are already allocated
|
874
|
+
#ifndef NDEBUG
|
875
|
+
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
|
876
|
+
#endif
|
877
|
+
return NULL;
|
878
|
+
}
|
879
|
+
|
880
|
+
ggml_backend_buffer_t buffer;
|
881
|
+
if (n_buffers == 1) {
|
882
|
+
buffer = buffers[0];
|
883
|
+
} else {
|
884
|
+
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
|
885
|
+
}
|
886
|
+
free(buffers);
|
827
887
|
return buffer;
|
828
888
|
}
|
829
889
|
|
@@ -19,6 +19,7 @@ extern "C" {
|
|
19
19
|
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
20
20
|
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
21
21
|
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
22
|
+
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
|
22
23
|
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
23
24
|
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
24
25
|
// check if tensor data is in host memory
|
@@ -63,6 +64,11 @@ extern "C" {
|
|
63
64
|
// do not use directly, use ggml_backend_tensor_copy instead
|
64
65
|
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
65
66
|
|
67
|
+
// buffer that contains a collection of buffers
|
68
|
+
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
69
|
+
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
70
|
+
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
71
|
+
|
66
72
|
//
|
67
73
|
// Backend
|
68
74
|
//
|
@@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
27
27
|
return buft->iface.get_alignment(buft);
|
28
28
|
}
|
29
29
|
|
30
|
+
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
31
|
+
// get_max_size is optional, defaults to SIZE_MAX
|
32
|
+
if (buft->iface.get_max_size) {
|
33
|
+
return buft->iface.get_max_size(buft);
|
34
|
+
}
|
35
|
+
return SIZE_MAX;
|
36
|
+
}
|
37
|
+
|
30
38
|
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
31
39
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
32
40
|
if (buft->iface.get_alloc_size) {
|
33
|
-
return buft->iface.get_alloc_size(buft, tensor);
|
41
|
+
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
42
|
+
assert(size >= ggml_nbytes(tensor));
|
43
|
+
return size;
|
34
44
|
}
|
35
45
|
return ggml_nbytes(tensor);
|
36
46
|
}
|
@@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
|
55
65
|
size_t size) {
|
56
66
|
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
57
67
|
|
58
|
-
GGML_ASSERT(iface.get_base != NULL);
|
59
|
-
|
60
68
|
(*buffer) = (struct ggml_backend_buffer) {
|
61
69
|
/* .interface = */ iface,
|
62
70
|
/* .buft = */ buft,
|
@@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
|
|
106
114
|
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
107
115
|
}
|
108
116
|
|
117
|
+
size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
118
|
+
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
119
|
+
}
|
120
|
+
|
109
121
|
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
110
122
|
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
111
123
|
}
|
@@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
|
120
132
|
|
121
133
|
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
122
134
|
buffer->usage = usage;
|
135
|
+
|
136
|
+
// FIXME: add a generic callback to the buffer interface
|
137
|
+
if (ggml_backend_buffer_is_multi_buffer(buffer)) {
|
138
|
+
ggml_backend_multi_buffer_set_usage(buffer, usage);
|
139
|
+
}
|
123
140
|
}
|
124
141
|
|
125
142
|
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
@@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
|
|
169
186
|
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
|
170
187
|
}
|
171
188
|
|
189
|
+
size_t ggml_backend_get_max_size(ggml_backend_t backend) {
|
190
|
+
return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
|
191
|
+
}
|
192
|
+
|
172
193
|
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
173
194
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
174
195
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
@@ -337,11 +358,26 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
|
337
358
|
ggml_backend_cuda_reg_devices();
|
338
359
|
#endif
|
339
360
|
|
361
|
+
#ifdef GGML_USE_SYCL
|
362
|
+
extern void ggml_backend_sycl_reg_devices(void);
|
363
|
+
ggml_backend_sycl_reg_devices();
|
364
|
+
#endif
|
365
|
+
|
340
366
|
#ifdef GGML_USE_METAL
|
341
367
|
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
342
368
|
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
343
369
|
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
344
370
|
#endif
|
371
|
+
|
372
|
+
#ifdef GGML_USE_VULKAN
|
373
|
+
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
|
374
|
+
ggml_backend_vk_reg_devices();
|
375
|
+
#endif
|
376
|
+
|
377
|
+
#ifdef GGML_USE_KOMPUTE
|
378
|
+
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
|
379
|
+
ggml_backend_kompute_reg_devices();
|
380
|
+
#endif
|
345
381
|
}
|
346
382
|
|
347
383
|
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
@@ -545,6 +581,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
545
581
|
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
546
582
|
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
547
583
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
584
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
548
585
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
549
586
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
550
587
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
@@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|
600
637
|
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
601
638
|
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
602
639
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
640
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
603
641
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
604
642
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
605
643
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
@@ -756,6 +794,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
|
|
756
794
|
GGML_UNUSED(user_data);
|
757
795
|
}
|
758
796
|
|
797
|
+
// multi-buffer buffer
|
798
|
+
|
799
|
+
struct ggml_backend_multi_buffer_context {
|
800
|
+
ggml_backend_buffer_t * buffers;
|
801
|
+
size_t n_buffers;
|
802
|
+
};
|
803
|
+
|
804
|
+
typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
|
805
|
+
|
806
|
+
GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
|
807
|
+
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
808
|
+
|
809
|
+
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
|
810
|
+
}
|
811
|
+
|
812
|
+
GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
813
|
+
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
814
|
+
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
815
|
+
ggml_backend_buffer_free(ctx->buffers[i]);
|
816
|
+
}
|
817
|
+
|
818
|
+
free(ctx->buffers);
|
819
|
+
free(ctx);
|
820
|
+
}
|
821
|
+
|
822
|
+
GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
823
|
+
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
824
|
+
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
825
|
+
ggml_backend_buffer_clear(ctx->buffers[i], value);
|
826
|
+
}
|
827
|
+
}
|
828
|
+
|
829
|
+
static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
|
830
|
+
static struct ggml_backend_buffer_i multi_backend_buffer_i = {
|
831
|
+
/* .get_name = */ ggml_backend_multi_buffer_get_name,
|
832
|
+
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
833
|
+
/* .get_base = */ NULL,
|
834
|
+
/* .init_tensor = */ NULL,
|
835
|
+
/* .set_tensor = */ NULL,
|
836
|
+
/* .get_tensor = */ NULL,
|
837
|
+
/* .cpy_tensor = */ NULL,
|
838
|
+
/* .clear = */ ggml_backend_multi_buffer_clear,
|
839
|
+
/* .reset = */ NULL,
|
840
|
+
};
|
841
|
+
|
842
|
+
return multi_backend_buffer_i;
|
843
|
+
}
|
844
|
+
|
845
|
+
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
|
846
|
+
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
|
847
|
+
ctx->n_buffers = n_buffers;
|
848
|
+
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
|
849
|
+
|
850
|
+
size_t total_size = 0;
|
851
|
+
for (size_t i = 0; i < n_buffers; i++) {
|
852
|
+
ctx->buffers[i] = buffers[i];
|
853
|
+
total_size += ggml_backend_buffer_get_size(buffers[i]);
|
854
|
+
}
|
855
|
+
|
856
|
+
return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
|
857
|
+
}
|
858
|
+
|
859
|
+
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
860
|
+
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
|
861
|
+
}
|
862
|
+
|
863
|
+
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
864
|
+
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
|
865
|
+
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
866
|
+
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
867
|
+
ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
|
868
|
+
}
|
869
|
+
}
|
870
|
+
|
759
871
|
|
760
872
|
// scheduler
|
761
873
|
|
@@ -20,6 +20,7 @@ extern "C" {
|
|
20
20
|
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
21
21
|
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
22
22
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
23
|
+
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
23
24
|
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
24
25
|
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
25
26
|
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
@@ -36,6 +37,7 @@ extern "C" {
|
|
36
37
|
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
37
38
|
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
38
39
|
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
40
|
+
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
39
41
|
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
40
42
|
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
41
43
|
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
@@ -54,6 +56,7 @@ extern "C" {
|
|
54
56
|
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
|
55
57
|
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
56
58
|
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
|
59
|
+
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
|
57
60
|
|
58
61
|
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
59
62
|
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|