llama_cpp 0.12.3 → 0.12.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +22 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -2
- data/vendor/tmp/llama.cpp/Makefile +23 -4
- data/vendor/tmp/llama.cpp/ggml-alloc.c +85 -25
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +115 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +121 -86
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +706 -15
- data/vendor/tmp/llama.cpp/ggml-quants.h +17 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +350 -57
- data/vendor/tmp/llama.cpp/ggml.h +7 -1
- data/vendor/tmp/llama.cpp/llama.cpp +574 -39
- data/vendor/tmp/llama.cpp/llama.h +11 -15
- metadata +9 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e77376858bfb07c67b29963a898f3cf9f2494a5cadabbc4cf777e87af801b33c
+  data.tar.gz: 1196c932182a2c76416c326dac934e97cb9111e6bed269c4776e05587391b916
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 594f4af7e1e88f156926b7605683e29b47a7caf3afb2c18434fa0035415902fb51a9dafe845a4a108bce0dfdd9ad63b5301790826ee6995fa1799cf2bff0c1ee
+  data.tar.gz: 4199b0e417efc0e469172c147aa766a81b3f073158eefc13315ab50e4240a4e2f41611e3c87939f4d3012357edf339b1450e49f2bc324f37f92040396342d476
data/CHANGELOG.md CHANGED

@@ -1,3 +1,12 @@
+## [[0.12.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.3...v0.12.4)] - 2024-02-03
+
+- Bump bundled llama.cpp from b1971 to b2047.
+- Add constant for file type: `LLAMA_FTYPE_MOSTLY_IQ3_XXS`.
+- Add `supports_mmap?`, `supports_mlock?`, and `supports_gpu_offload?` module functions to `LLaMACpp`.
+- Add `--with-vulkan` configuration option.
+- Deprecate `mmap_supported?` and `mlock_supported?` module functions in `LLaMACpp`.
+- Remove `LLAMA_MAX_DEVICES` constant.
+
 ## [[0.12.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.2...v0.12.3)] - 2024-01-27
 
 - Bump bundled llama.cpp from b1892 to b1971.
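The renamed probes track the matching rename in the llama.cpp C API (llama_supports_mmap and friends, visible in the binding diff further down). A minimal usage sketch, based only on the module functions this release defines; actual return values depend on how the extension was built:

    require 'llama_cpp'

    # New in 0.12.4: preferred feature probes.
    LLaMACpp.supports_mmap?        # => true or false
    LLaMACpp.supports_mlock?       # => true or false
    LLaMACpp.supports_gpu_offload? # => true or false

    # Deprecated names still work but emit a warning via rb_warn, e.g.
    #   mmap_supported? is deprecated. Use supports_mmap? instead.
    LLaMACpp.mmap_supported?

    # LLAMA_MAX_DEVICES is gone; query the runtime value instead.
    LLaMACpp.max_devices # => Integer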
data/ext/llama_cpp/extconf.rb CHANGED

@@ -19,6 +19,7 @@ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
 make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
+make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
 
 Dir.chdir(LLAMA_CPP_DIR) do
   _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
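Like the existing `--with-cublas` and `--with-clblast` switches, the new flag is forwarded to the bundled Makefile as `LLAMA_VULKAN=1`, so installing with Vulkan support should look like `gem install llama_cpp -- --with-vulkan` (assuming the standard RubyGems `--` pass-through for extension options; the Vulkan SDK itself must be installed separately).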
data/ext/llama_cpp/llama_cpp.cpp CHANGED

@@ -843,15 +843,15 @@ private:
 
   // tensor_split
   static VALUE _llama_model_params_get_tensor_split(VALUE self) {
-    if (
+    if (llama_max_devices() < 1) {
       return rb_ary_new();
     }
-    VALUE ret = rb_ary_new2(
+    VALUE ret = rb_ary_new2(llama_max_devices());
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
     if (ptr->params.tensor_split == nullptr) {
       return rb_ary_new();
     }
-    for (size_t i = 0; i <
+    for (size_t i = 0; i < llama_max_devices(); i++) {
       rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
     }
     return ret;

@@ -3259,15 +3259,29 @@ static VALUE rb_llama_time_us(VALUE self) {
 }
 
 static VALUE rb_llama_mmap_supported(VALUE self) {
+  rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
   return llama_mmap_supported() ? Qtrue : Qfalse;
 }
 
 static VALUE rb_llama_mlock_supported(VALUE self) {
+  rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
   return llama_mlock_supported() ? Qtrue : Qfalse;
 }
 
 static VALUE rb_llama_max_devices(VALUE self) {
-  return
+  return SIZET2NUM(llama_max_devices());
+}
+
+static VALUE rb_llama_supports_mmap(VALUE self) {
+  return llama_supports_mmap() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_supports_mlock(VALUE self) {
+  return llama_supports_mlock() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_supports_gpu_offload(VALUE self) {
+  return llama_supports_gpu_offload() ? Qtrue : Qfalse;
 }
 
 extern "C" void Init_llama_cpp(void) {

@@ -3294,8 +3308,9 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
-
-
+  rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
+  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
+  rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));

@@ -3329,6 +3344,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
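With the compile-time constant gone, the `tensor_split` accessor now sizes its result from the runtime `llama_max_devices()`. A sketch of the Ruby-side effect, assuming the wrapper class is exposed as `LLaMACpp::ModelParams` (that name is not shown in this diff):

    require 'llama_cpp'

    params = LLaMACpp::ModelParams.new
    # Returns [] while no split has been set (tensor_split == nullptr in C);
    # otherwise an array of LLaMACpp.max_devices floats.
    params.tensor_split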
data/lib/llama_cpp/version.rb CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.3'
+  VERSION = '0.12.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1971'
+  LLAMA_CPP_VERSION = 'b2047'
 end
data/sig/llama_cpp.rbs CHANGED

@@ -3,8 +3,6 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
-  LLAMA_MAX_DEVICES: Integer
-
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer

@@ -26,6 +24,7 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer

@@ -61,6 +60,9 @@ module LLaMACpp
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
+  def self?.supports_mmap?: () -> bool
+  def self?.supports_mlock?: () -> bool
+  def self?.supports_gpu_offload?: () -> bool
 
   class TokenData
     public
data/vendor/tmp/llama.cpp/Makefile CHANGED

@@ -9,7 +9,7 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

@@ -450,6 +450,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST
 
+ifdef LLAMA_VULKAN
+	MK_CPPFLAGS += -DGGML_USE_VULKAN
+	MK_LDFLAGS += -lvulkan
+	OBJS += ggml-vulkan.o
+
+ifdef LLAMA_VULKAN_CHECK_RESULTS
+	MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
+ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_VULKAN
+
 ifdef LLAMA_HIPBLAS
 
 ifeq ($(wildcard /opt/rocm),)

@@ -575,12 +588,15 @@ train.o: common/train.cpp common/train.h
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
+libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+
 lib: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.dll *.dylib
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 
 #
 # Examples

@@ -625,7 +641,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)

@@ -754,5 +770,8 @@ tests/test-c.o: tests/test-c.c llama.h
 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-
+tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
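This is the same `make lib` path the gem's extconf.rb drives, so a hand build with Vulkan enabled amounts to running `make lib LLAMA_VULKAN=1` inside data/vendor/tmp/llama.cpp; per the block above, additionally setting `LLAMA_VULKAN_CHECK_RESULTS=1` defines GGML_VULKAN_CHECK_RESULTS for result verification.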
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED

@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
 }
 
 size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-
+    // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+    // to avoid this, we add a 10% margin to the buffer size
+    return alloc->max_size + alloc->max_size/10;
 }
 
 // graph allocator

@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
 }
 
 // utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-    size_t nbytes = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-    }
-
-    if (nbytes == 0) {
-        // all the tensors in the context are already allocated
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
 
-
+static bool alloc_tensor_range(struct ggml_context * ctx,
+        struct ggml_tensor * first, struct ggml_tensor * last,
+        ggml_backend_buffer_type_t buft, size_t size,
+        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
-        // failed to allocate buffer
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
-
+        for (size_t i = 0; i < *n_buffers; i++) {
+            ggml_backend_buffer_free(*buffers[i]);
+        }
+        free(*buffers);
+        return false;
     }
 
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
-    for (struct ggml_tensor * t =
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);

@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     ggml_tallocr_free(tallocr);
 
+    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+    (*buffers)[(*n_buffers)++] = buffer;
+
+    return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    size_t cur_buf_size = 0;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        if (this_size > max_size) {
+            // tensor is too large to fit in a single buffer
+            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+                    __func__, t->name,
+                    ggml_backend_buft_name(buft),
+                    this_size, max_size);
+            for (size_t i = 0; i < n_buffers; i++) {
+                ggml_backend_buffer_free(buffers[i]);
+            }
+            free(buffers);
+            return NULL;
+        }
+
+        if ((cur_buf_size + this_size) > max_size) {
+            // allocate tensors in the current buffer
+            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
+            first = t;
+            cur_buf_size = this_size;
+        } else {
+            cur_buf_size += this_size;
+        }
+    }
+
+    // allocate remaining tensors
+    if (cur_buf_size > 0) {
+        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+    }
+
+    if (n_buffers == 0) {
+        // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
     return buffer;
 }
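The rewritten ggml_backend_alloc_ctx_tensors_from_buft no longer assumes one allocation can hold every tensor: it walks the context's tensors and greedily packs them into as many buffers as the backend's max buffer size requires. A minimal Ruby model of just that splitting decision (names and the size-only simplification are mine, not part of the gem or the C API):

    # Greedily partition tensor sizes into buffers capped at max_size,
    # starting a new buffer when the next tensor would overflow the
    # current one (like alloc_tensor_range closing a range in the C code).
    def split_into_buffers(tensor_sizes, max_size)
      buffers = [] # each entry is the total byte size of one buffer
      current = 0
      tensor_sizes.each do |size|
        raise ArgumentError, 'tensor larger than max buffer size' if size > max_size

        if current + size > max_size
          buffers << current
          current = size
        else
          current += size
        end
      end
      buffers << current if current.positive?
      buffers
    end

    split_into_buffers([300, 400, 500, 200], 1000) # => [700, 700]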
data/vendor/tmp/llama.cpp/ggml-backend-impl.h CHANGED

@@ -19,6 +19,7 @@ extern "C" {
         const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
         size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
         size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory

@@ -63,6 +64,11 @@ extern "C" {
     // do not use directly, use ggml_backend_tensor_copy instead
     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
     //
     // Backend
     //
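Note that get_max_size joins get_alloc_size as an optional member of the buffer-type vtable: backends that leave it NULL get an unlimited default, as ggml_backend_buft_get_max_size shows in the next file. A tiny Ruby analogue of that optional-member-with-default convention (hypothetical names, for illustration only):

    # Stand-in for a buffer type; :max_size may simply be absent.
    def buft_max_size(buft)
      # mirrors ggml_backend_buft_get_max_size: optional, defaults to SIZE_MAX
      buft.respond_to?(:max_size) ? buft.max_size : Float::INFINITY
    end

    buft_max_size(Struct.new(:max_size).new(4096)) # => 4096
    buft_max_size(Object.new)                      # => Infinity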
data/vendor/tmp/llama.cpp/ggml-backend.c CHANGED

@@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }
 
+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+    // get_max_size is optional, defaults to SIZE_MAX
+    if (buft->iface.get_max_size) {
+        return buft->iface.get_max_size(buft);
+    }
+    return SIZE_MAX;
+}
+
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
-
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
     }
     return ggml_nbytes(tensor);
 }

@@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
         size_t size) {
     ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
 
-    GGML_ASSERT(iface.get_base != NULL);
-
     (*buffer) = (struct ggml_backend_buffer) {
         /* .interface = */ iface,
         /* .buft      = */ buft,

@@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
 
+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }

@@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
 
 void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
     buffer->usage = usage;
+
+    // FIXME: add a generic callback to the buffer interface
+    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+        ggml_backend_multi_buffer_set_usage(buffer, usage);
+    }
 }
 
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {

@@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
     return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
 }
 
+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
 void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

@@ -337,11 +358,26 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_cuda_reg_devices();
 #endif
 
+#ifdef GGML_USE_SYCL
+    extern void ggml_backend_sycl_reg_devices(void);
+    ggml_backend_sycl_reg_devices();
+#endif
+
 #ifdef GGML_USE_METAL
     extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
     extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
+
+#ifdef GGML_USE_VULKAN
+    extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+    ggml_backend_vk_reg_devices();
+#endif
+
+#ifdef GGML_USE_KOMPUTE
+    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
+    ggml_backend_kompute_reg_devices();
+#endif
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {

@@ -545,6 +581,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,

@@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,

@@ -756,6 +794,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
     GGML_UNUSED(user_data);
 }
 
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_free(ctx->buffers[i]);
+    }
+
+    free(ctx->buffers);
+    free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_clear(ctx->buffers[i], value);
+    }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+    static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+        /* .get_name        = */ ggml_backend_multi_buffer_get_name,
+        /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
+        /* .get_base        = */ NULL,
+        /* .init_tensor     = */ NULL,
+        /* .set_tensor      = */ NULL,
+        /* .get_tensor      = */ NULL,
+        /* .cpy_tensor      = */ NULL,
+        /* .clear           = */ ggml_backend_multi_buffer_clear,
+        /* .reset           = */ NULL,
+    };
+
+    return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+    }
+}
+
 
 // scheduler
 
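The multi-buffer added above is a plain composite: it reports its size as the children's total, borrows its name from the first child, and fans free, clear, and set_usage out to every child. A small Ruby model of that shape (hypothetical class, not gem API; children are anything responding to size/name/clear/free/usage=):

    class MultiBuffer
      def initialize(buffers)
        @buffers = buffers
        @size = buffers.sum(&:size) # composite reports the children's total size
      end

      attr_reader :size

      # like ggml_backend_multi_buffer_get_name: delegate to the first child
      def name
        @buffers.first.name
      end

      # like ggml_backend_multi_buffer_clear: fan out to every child
      def clear(value)
        @buffers.each { |b| b.clear(value) }
      end

      # like ggml_backend_multi_buffer_free_buffer
      def free
        @buffers.each(&:free)
      end

      # like ggml_backend_multi_buffer_set_usage
      def usage=(usage)
        @buffers.each { |b| b.usage = usage }
      end
    end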
data/vendor/tmp/llama.cpp/ggml-backend.h CHANGED

@@ -20,6 +20,7 @@ extern "C" {
     GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

@@ -36,6 +37,7 @@ extern "C" {
     GGML_API           size_t      ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
     GGML_API GGML_CALL void        ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API           size_t      ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t      ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
     GGML_API           size_t      ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API           void        ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API           bool        ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);

@@ -54,6 +56,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
     GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
     GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
 
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);