llama_cpp 0.12.3 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +22 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -2
- data/vendor/tmp/llama.cpp/Makefile +23 -4
- data/vendor/tmp/llama.cpp/ggml-alloc.c +85 -25
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +115 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +121 -86
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +706 -15
- data/vendor/tmp/llama.cpp/ggml-quants.h +17 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +350 -57
- data/vendor/tmp/llama.cpp/ggml.h +7 -1
- data/vendor/tmp/llama.cpp/llama.cpp +574 -39
- data/vendor/tmp/llama.cpp/llama.h +11 -15
- metadata +9 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: e77376858bfb07c67b29963a898f3cf9f2494a5cadabbc4cf777e87af801b33c
         | 
| 4 | 
            +
              data.tar.gz: 1196c932182a2c76416c326dac934e97cb9111e6bed269c4776e05587391b916
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 594f4af7e1e88f156926b7605683e29b47a7caf3afb2c18434fa0035415902fb51a9dafe845a4a108bce0dfdd9ad63b5301790826ee6995fa1799cf2bff0c1ee
         | 
| 7 | 
            +
              data.tar.gz: 4199b0e417efc0e469172c147aa766a81b3f073158eefc13315ab50e4240a4e2f41611e3c87939f4d3012357edf339b1450e49f2bc324f37f92040396342d476
         | 
    
        data/CHANGELOG.md
    CHANGED
    
    | @@ -1,3 +1,12 @@ | |
| 1 | 
            +
            ## [[0.12.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.3...v0.12.4)] - 2024-02-03
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            - Bump bundled llama.cpp from b1971 to b2047.
         | 
| 4 | 
            +
              - Add constant for file type: `LLAMA_FTYPE_MOSTLY_IQ3_XXS`.
         | 
| 5 | 
            +
              - Add `supports_mmap?`, `supports_mlock?`, and `supports_gpu_offload?` module functions to `LLaMACpp`.
         | 
| 6 | 
            +
              - Add `--with-vulkan` configuration option.
         | 
| 7 | 
            +
              - Deprecate `mmap_supported?` and `mlock_supported?` module functions in `LLaMACpp`.
         | 
| 8 | 
            +
              - Remove `LLAMA_MAX_DEVICES` constant.
         | 
| 9 | 
            +
             | 
| 1 10 | 
             
            ## [[0.12.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.2...v0.12.3)] - 2024-01-27
         | 
| 2 11 |  | 
| 3 12 | 
             
            - Bump bundled llama.cpp from b1892 to b1971.
         | 
    
        data/ext/llama_cpp/extconf.rb
    CHANGED
    
    | @@ -19,6 +19,7 @@ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') | |
| 19 19 | 
             
            make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
         | 
| 20 20 | 
             
            make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
         | 
| 21 21 | 
             
            make_envs << ' LLAMA_MPI=1' if with_config('mpi')
         | 
| 22 | 
            +
            make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
         | 
| 22 23 |  | 
| 23 24 | 
             
            Dir.chdir(LLAMA_CPP_DIR) do
         | 
| 24 25 | 
             
              _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
         | 
    
        data/ext/llama_cpp/llama_cpp.cpp
    CHANGED
    
    | @@ -843,15 +843,15 @@ private: | |
| 843 843 |  | 
| 844 844 | 
             
              // tensor_split
         | 
| 845 845 | 
             
              static VALUE _llama_model_params_get_tensor_split(VALUE self) {
         | 
| 846 | 
            -
                if ( | 
| 846 | 
            +
                if (llama_max_devices() < 1) {
         | 
| 847 847 | 
             
                  return rb_ary_new();
         | 
| 848 848 | 
             
                }
         | 
| 849 | 
            -
                VALUE ret = rb_ary_new2( | 
| 849 | 
            +
                VALUE ret = rb_ary_new2(llama_max_devices());
         | 
| 850 850 | 
             
                LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
         | 
| 851 851 | 
             
                if (ptr->params.tensor_split == nullptr) {
         | 
| 852 852 | 
             
                  return rb_ary_new();
         | 
| 853 853 | 
             
                }
         | 
| 854 | 
            -
                for (size_t i = 0; i <  | 
| 854 | 
            +
                for (size_t i = 0; i < llama_max_devices(); i++) {
         | 
| 855 855 | 
             
                  rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
         | 
| 856 856 | 
             
                }
         | 
| 857 857 | 
             
                return ret;
         | 
| @@ -3259,15 +3259,29 @@ static VALUE rb_llama_time_us(VALUE self) { | |
| 3259 3259 | 
             
            }
         | 
| 3260 3260 |  | 
| 3261 3261 | 
             
            static VALUE rb_llama_mmap_supported(VALUE self) {
         | 
| 3262 | 
            +
              rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
         | 
| 3262 3263 | 
             
              return llama_mmap_supported() ? Qtrue : Qfalse;
         | 
| 3263 3264 | 
             
            }
         | 
| 3264 3265 |  | 
| 3265 3266 | 
             
            static VALUE rb_llama_mlock_supported(VALUE self) {
         | 
| 3267 | 
            +
              rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
         | 
| 3266 3268 | 
             
              return llama_mlock_supported() ? Qtrue : Qfalse;
         | 
| 3267 3269 | 
             
            }
         | 
| 3268 3270 |  | 
| 3269 3271 | 
             
            static VALUE rb_llama_max_devices(VALUE self) {
         | 
| 3270 | 
            -
              return  | 
| 3272 | 
            +
              return SIZET2NUM(llama_max_devices());
         | 
| 3273 | 
            +
            }
         | 
| 3274 | 
            +
             | 
| 3275 | 
            +
            static VALUE rb_llama_supports_mmap(VALUE self) {
         | 
| 3276 | 
            +
              return llama_supports_mmap() ? Qtrue : Qfalse;
         | 
| 3277 | 
            +
            }
         | 
| 3278 | 
            +
             | 
| 3279 | 
            +
            static VALUE rb_llama_supports_mlock(VALUE self) {
         | 
| 3280 | 
            +
              return llama_supports_mlock() ? Qtrue : Qfalse;
         | 
| 3281 | 
            +
            }
         | 
| 3282 | 
            +
             | 
| 3283 | 
            +
            static VALUE rb_llama_supports_gpu_offload(VALUE self) {
         | 
| 3284 | 
            +
              return llama_supports_gpu_offload() ? Qtrue : Qfalse;
         | 
| 3271 3285 | 
             
            }
         | 
| 3272 3286 |  | 
| 3273 3287 | 
             
            extern "C" void Init_llama_cpp(void) {
         | 
| @@ -3294,8 +3308,9 @@ extern "C" void Init_llama_cpp(void) { | |
| 3294 3308 | 
             
              rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
         | 
| 3295 3309 | 
             
              rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
         | 
| 3296 3310 | 
             
              rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
         | 
| 3297 | 
            -
             | 
| 3298 | 
            -
               | 
| 3311 | 
            +
              rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
         | 
| 3312 | 
            +
              rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
         | 
| 3313 | 
            +
              rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
         | 
| 3299 3314 |  | 
| 3300 3315 | 
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
         | 
| 3301 3316 | 
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
         | 
| @@ -3329,6 +3344,7 @@ extern "C" void Init_llama_cpp(void) { | |
| 3329 3344 | 
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
         | 
| 3330 3345 | 
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
         | 
| 3331 3346 | 
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
         | 
| 3347 | 
            +
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
         | 
| 3332 3348 |  | 
| 3333 3349 | 
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
         | 
| 3334 3350 |  | 
    
        data/lib/llama_cpp/version.rb
    CHANGED
    
    | @@ -3,8 +3,8 @@ | |
| 3 3 | 
             
            # llama_cpp.rb provides Ruby bindings for the llama.cpp.
         | 
| 4 4 | 
             
            module LLaMACpp
         | 
| 5 5 | 
             
              # The version of llama_cpp.rb you install.
         | 
| 6 | 
            -
              VERSION = '0.12.3'
         | 
| 6 | 
            +
              VERSION = '0.12.4'
         | 
| 7 7 |  | 
| 8 8 | 
             
              # The version of llama.cpp bundled with llama_cpp.rb.
         | 
| 9 | 
            -
              LLAMA_CPP_VERSION = 'b1971'
         | 
| 9 | 
            +
              LLAMA_CPP_VERSION = 'b2047'
         | 
| 10 10 | 
             
            end
         | 
    
        data/sig/llama_cpp.rbs
    CHANGED
    
    | @@ -3,8 +3,6 @@ module LLaMACpp | |
| 3 3 | 
             
              LLAMA_CPP_VERSION: String
         | 
| 4 4 | 
             
              LLAMA_DEFALUT_SEED: String
         | 
| 5 5 |  | 
| 6 | 
            -
              LLAMA_MAX_DEVICES: Integer
         | 
| 7 | 
            -
             | 
| 8 6 | 
             
              LLAMA_FTYPE_ALL_F32: Integer
         | 
| 9 7 | 
             
              LLAMA_FTYPE_MOSTLY_F16: Integer
         | 
| 10 8 | 
             
              LLAMA_FTYPE_MOSTLY_Q4_0: Integer
         | 
| @@ -26,6 +24,7 @@ module LLaMACpp | |
| 26 24 | 
             
              LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
         | 
| 27 25 | 
             
              LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
         | 
| 28 26 | 
             
              LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
         | 
| 27 | 
            +
              LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
         | 
| 29 28 |  | 
| 30 29 | 
             
              LLAMA_KV_OVERRIDE_INT: Integer
         | 
| 31 30 | 
             
              LLAMA_KV_OVERRIDE_FLOAT: Integer
         | 
| @@ -61,6 +60,9 @@ module LLaMACpp | |
| 61 60 | 
             
              def self?.mmap_supported?: () -> bool
         | 
| 62 61 | 
             
              def self?.mlock_supported?: () -> bool
         | 
| 63 62 | 
             
              def self?.max_devices: () -> Integer
         | 
| 63 | 
            +
              def self?.supports_mmap?: () -> bool
         | 
| 64 | 
            +
              def self?.supports_mlock?: () -> bool
         | 
| 65 | 
            +
              def self?.supports_gpu_offload?: () -> bool
         | 
| 64 66 |  | 
| 65 67 | 
             
              class TokenData
         | 
| 66 68 | 
             
                public
         | 
    
        data/vendor/tmp/llama.cpp/Makefile
    CHANGED
    
    | @@ -9,7 +9,7 @@ TEST_TARGETS = \ | |
| 9 9 | 
             
            	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
         | 
| 10 10 | 
             
            	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
         | 
| 11 11 | 
             
            	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
         | 
| 12 | 
            -
            	tests/test-backend-ops tests/test-autorelease
         | 
| 12 | 
            +
            	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
         | 
| 13 13 |  | 
| 14 14 | 
             
            # Code coverage output files
         | 
| 15 15 | 
             
            COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
         | 
| @@ -450,6 +450,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h | |
| 450 450 | 
             
            	$(CXX) $(CXXFLAGS) -c $< -o $@
         | 
| 451 451 | 
             
            endif # LLAMA_CLBLAST
         | 
| 452 452 |  | 
| 453 | 
            +
            ifdef LLAMA_VULKAN
         | 
| 454 | 
            +
            	MK_CPPFLAGS  += -DGGML_USE_VULKAN
         | 
| 455 | 
            +
            	MK_LDFLAGS += -lvulkan
         | 
| 456 | 
            +
            	OBJS    += ggml-vulkan.o
         | 
| 457 | 
            +
             | 
| 458 | 
            +
            ifdef LLAMA_VULKAN_CHECK_RESULTS
         | 
| 459 | 
            +
            	MK_CPPFLAGS  += -DGGML_VULKAN_CHECK_RESULTS
         | 
| 460 | 
            +
            endif
         | 
| 461 | 
            +
             | 
| 462 | 
            +
            ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
         | 
| 463 | 
            +
            	$(CXX) $(CXXFLAGS) -c $< -o $@
         | 
| 464 | 
            +
            endif # LLAMA_VULKAN
         | 
| 465 | 
            +
             | 
| 453 466 | 
             
            ifdef LLAMA_HIPBLAS
         | 
| 454 467 |  | 
| 455 468 | 
             
            	ifeq ($(wildcard /opt/rocm),)
         | 
| @@ -575,12 +588,15 @@ train.o: common/train.cpp common/train.h | |
| 575 588 | 
             
            libllama.so: llama.o ggml.o $(OBJS)
         | 
| 576 589 | 
             
            	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
         | 
| 577 590 |  | 
| 591 | 
            +
            libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
         | 
| 592 | 
            +
            	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
         | 
| 593 | 
            +
             | 
| 578 594 | 
             
            lib: llama.o ggml.o $(OBJS)
         | 
| 579 595 | 
             
            	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
         | 
| 580 596 | 
             
            	ar rcs libllama.a $^
         | 
| 581 597 |  | 
| 582 598 | 
             
            clean:
         | 
| 583 | 
            -
            	rm -vrf *.o tests/*.o *.so *.dll *.dylib  | 
| 599 | 
            +
            	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
         | 
| 584 600 |  | 
| 585 601 | 
             
            #
         | 
| 586 602 | 
             
            # Examples
         | 
| @@ -625,7 +641,7 @@ embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(C | |
| 625 641 | 
             
            save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
         | 
| 626 642 | 
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
         | 
| 627 643 |  | 
| 628 | 
            -
            server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
         | 
| 644 | 
            +
            server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
         | 
| 629 645 | 
             
            	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
         | 
| 630 646 |  | 
| 631 647 | 
             
            gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
         | 
| @@ -754,5 +770,8 @@ tests/test-c.o: tests/test-c.c llama.h | |
| 754 770 | 
             
            tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
         | 
| 755 771 | 
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
         | 
| 756 772 |  | 
| 757 | 
            -
            tests/test- | 
| 773 | 
            +
            tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
         | 
| 774 | 
            +
            	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
         | 
| 775 | 
            +
             | 
| 776 | 
            +
            tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
         | 
| 758 777 | 
             
            	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
         | 
    
        data/vendor/tmp/llama.cpp/ggml-alloc.c
    CHANGED
    
    | @@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) { | |
| 335 335 | 
             
            }
         | 
| 336 336 |  | 
| 337 337 | 
             
            size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
         | 
| 338 | 
            -
                 | 
| 338 | 
            +
                // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
         | 
| 339 | 
            +
                // to avoid this, we add a 10% margin to the buffer size
         | 
| 340 | 
            +
                return alloc->max_size + alloc->max_size/10;
         | 
| 339 341 | 
             
            }
         | 
| 340 342 |  | 
| 341 343 | 
             
            // graph allocator
         | 
| @@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) | |
| 776 778 | 
             
            }
         | 
| 777 779 |  | 
| 778 780 | 
             
            // utils
         | 
| 779 | 
            -
            ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
         | 
| 780 | 
            -
                GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
         | 
| 781 | 
            -
             | 
| 782 | 
            -
                size_t alignment = ggml_backend_buft_get_alignment(buft);
         | 
| 783 | 
            -
             | 
| 784 | 
            -
                size_t nbytes = 0;
         | 
| 785 | 
            -
                for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
         | 
| 786 | 
            -
                    if (t->data == NULL && t->view_src == NULL) {
         | 
| 787 | 
            -
                        nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         | 
| 788 | 
            -
                    }
         | 
| 789 | 
            -
                }
         | 
| 790 | 
            -
             | 
| 791 | 
            -
                if (nbytes == 0) {
         | 
| 792 | 
            -
                    // all the tensors in the context are already allocated
         | 
| 793 | 
            -
            #ifndef NDEBUG
         | 
| 794 | 
            -
                    fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
         | 
| 795 | 
            -
            #endif
         | 
| 796 | 
            -
                    return NULL;
         | 
| 797 | 
            -
                }
         | 
| 798 781 |  | 
| 799 | 
            -
             | 
| 782 | 
            +
            static bool alloc_tensor_range(struct ggml_context * ctx,
         | 
| 783 | 
            +
                    struct ggml_tensor * first, struct ggml_tensor * last,
         | 
| 784 | 
            +
                    ggml_backend_buffer_type_t buft, size_t size,
         | 
| 785 | 
            +
                    ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
         | 
| 786 | 
            +
                ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
         | 
| 800 787 | 
             
                if (buffer == NULL) {
         | 
| 801 | 
            -
                    // failed to allocate buffer
         | 
| 802 788 | 
             
            #ifndef NDEBUG
         | 
| 803 | 
            -
                    fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
         | 
| 789 | 
            +
                    fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
         | 
| 804 790 | 
             
            #endif
         | 
| 805 | 
            -
                     | 
| 791 | 
            +
                    for (size_t i = 0; i < *n_buffers; i++) {
         | 
| 792 | 
            +
                        ggml_backend_buffer_free(*buffers[i]);
         | 
| 793 | 
            +
                    }
         | 
| 794 | 
            +
                    free(*buffers);
         | 
| 795 | 
            +
                    return false;
         | 
| 806 796 | 
             
                }
         | 
| 807 797 |  | 
| 808 798 | 
             
                ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
         | 
| 809 799 |  | 
| 810 | 
            -
                for (struct ggml_tensor * t =  | 
| 800 | 
            +
                for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         | 
| 811 801 | 
             
                    if (t->data == NULL) {
         | 
| 812 802 | 
             
                        if (t->view_src == NULL) {
         | 
| 813 803 | 
             
                            ggml_tallocr_alloc(tallocr, t);
         | 
| @@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte | |
| 824 814 |  | 
| 825 815 | 
             
                ggml_tallocr_free(tallocr);
         | 
| 826 816 |  | 
| 817 | 
            +
                *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
         | 
| 818 | 
            +
                (*buffers)[(*n_buffers)++] = buffer;
         | 
| 819 | 
            +
             | 
| 820 | 
            +
                return true;
         | 
| 821 | 
            +
            }
         | 
| 822 | 
            +
             | 
| 823 | 
            +
            ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
         | 
| 824 | 
            +
                GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
         | 
| 825 | 
            +
             | 
| 826 | 
            +
                size_t alignment = ggml_backend_buft_get_alignment(buft);
         | 
| 827 | 
            +
                size_t max_size = ggml_backend_buft_get_max_size(buft);
         | 
| 828 | 
            +
             | 
| 829 | 
            +
                ggml_backend_buffer_t * buffers = NULL;
         | 
| 830 | 
            +
                size_t n_buffers = 0;
         | 
| 831 | 
            +
             | 
| 832 | 
            +
                size_t cur_buf_size = 0;
         | 
| 833 | 
            +
                struct ggml_tensor * first = ggml_get_first_tensor(ctx);
         | 
| 834 | 
            +
                for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
         | 
| 835 | 
            +
                    size_t this_size = 0;
         | 
| 836 | 
            +
                    if (t->data == NULL && t->view_src == NULL) {
         | 
| 837 | 
            +
                        this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         | 
| 838 | 
            +
                    }
         | 
| 839 | 
            +
             | 
| 840 | 
            +
                    if (this_size > max_size) {
         | 
| 841 | 
            +
                        // tensor is too large to fit in a single buffer
         | 
| 842 | 
            +
                        fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
         | 
| 843 | 
            +
                                __func__, t->name,
         | 
| 844 | 
            +
                                ggml_backend_buft_name(buft),
         | 
| 845 | 
            +
                                this_size, max_size);
         | 
| 846 | 
            +
                        for (size_t i = 0; i < n_buffers; i++) {
         | 
| 847 | 
            +
                            ggml_backend_buffer_free(buffers[i]);
         | 
| 848 | 
            +
                        }
         | 
| 849 | 
            +
                        free(buffers);
         | 
| 850 | 
            +
                        return NULL;
         | 
| 851 | 
            +
                    }
         | 
| 852 | 
            +
             | 
| 853 | 
            +
                    if ((cur_buf_size + this_size) > max_size) {
         | 
| 854 | 
            +
                        // allocate tensors in the current buffer
         | 
| 855 | 
            +
                        if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
         | 
| 856 | 
            +
                            return NULL;
         | 
| 857 | 
            +
                        }
         | 
| 858 | 
            +
                        first = t;
         | 
| 859 | 
            +
                        cur_buf_size = this_size;
         | 
| 860 | 
            +
                    } else {
         | 
| 861 | 
            +
                        cur_buf_size += this_size;
         | 
| 862 | 
            +
                    }
         | 
| 863 | 
            +
                }
         | 
| 864 | 
            +
             | 
| 865 | 
            +
                // allocate remaining tensors
         | 
| 866 | 
            +
                if (cur_buf_size > 0) {
         | 
| 867 | 
            +
                    if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
         | 
| 868 | 
            +
                        return NULL;
         | 
| 869 | 
            +
                    }
         | 
| 870 | 
            +
                }
         | 
| 871 | 
            +
             | 
| 872 | 
            +
                if (n_buffers == 0) {
         | 
| 873 | 
            +
                    // all the tensors in the context are already allocated
         | 
| 874 | 
            +
            #ifndef NDEBUG
         | 
| 875 | 
            +
                    fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
         | 
| 876 | 
            +
            #endif
         | 
| 877 | 
            +
                    return NULL;
         | 
| 878 | 
            +
                }
         | 
| 879 | 
            +
             | 
| 880 | 
            +
                ggml_backend_buffer_t buffer;
         | 
| 881 | 
            +
                if (n_buffers == 1) {
         | 
| 882 | 
            +
                    buffer = buffers[0];
         | 
| 883 | 
            +
                } else {
         | 
| 884 | 
            +
                    buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
         | 
| 885 | 
            +
                }
         | 
| 886 | 
            +
                free(buffers);
         | 
| 827 887 | 
             
                return buffer;
         | 
| 828 888 | 
             
            }
         | 
| 829 889 |  | 
    
        data/vendor/tmp/llama.cpp/ggml-backend-impl.h
    CHANGED
    
    | @@ -19,6 +19,7 @@ extern "C" { | |
| 19 19 | 
             
                    const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
         | 
| 20 20 | 
             
                    ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
         | 
| 21 21 | 
             
                    size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
         | 
| 22 | 
            +
                    size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
         | 
| 22 23 | 
             
                    size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         | 
| 23 24 | 
             
                    bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         | 
| 24 25 | 
             
                    // check if tensor data is in host memory
         | 
| @@ -63,6 +64,11 @@ extern "C" { | |
| 63 64 | 
             
                // do not use directly, use ggml_backend_tensor_copy instead
         | 
| 64 65 | 
             
                bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
         | 
| 65 66 |  | 
| 67 | 
            +
                // buffer that contains a collection of buffers
         | 
| 68 | 
            +
                GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
         | 
| 69 | 
            +
                GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
         | 
| 70 | 
            +
                GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
         | 
| 71 | 
            +
             | 
| 66 72 | 
             
                //
         | 
| 67 73 | 
             
                // Backend
         | 
| 68 74 | 
             
                //
         | 
| @@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) { | |
| 27 27 | 
             
                return buft->iface.get_alignment(buft);
         | 
| 28 28 | 
             
            }
         | 
| 29 29 |  | 
| 30 | 
            +
            size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
         | 
| 31 | 
            +
                // get_max_size is optional, defaults to SIZE_MAX
         | 
| 32 | 
            +
                if (buft->iface.get_max_size) {
         | 
| 33 | 
            +
                    return buft->iface.get_max_size(buft);
         | 
| 34 | 
            +
                }
         | 
| 35 | 
            +
                return SIZE_MAX;
         | 
| 36 | 
            +
            }
         | 
| 37 | 
            +
             | 
| 30 38 | 
             
            GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
         | 
| 31 39 | 
             
                // get_alloc_size is optional, defaults to ggml_nbytes
         | 
| 32 40 | 
             
                if (buft->iface.get_alloc_size) {
         | 
| 33 | 
            -
                     | 
| 41 | 
            +
                    size_t size = buft->iface.get_alloc_size(buft, tensor);
         | 
| 42 | 
            +
                    assert(size >= ggml_nbytes(tensor));
         | 
| 43 | 
            +
                    return size;
         | 
| 34 44 | 
             
                }
         | 
| 35 45 | 
             
                return ggml_nbytes(tensor);
         | 
| 36 46 | 
             
            }
         | 
| @@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( | |
| 55 65 | 
             
                           size_t                          size) {
         | 
| 56 66 | 
             
                ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
         | 
| 57 67 |  | 
| 58 | 
            -
                GGML_ASSERT(iface.get_base != NULL);
         | 
| 59 | 
            -
             | 
| 60 68 | 
             
                (*buffer) = (struct ggml_backend_buffer) {
         | 
| 61 69 | 
             
                    /* .interface = */ iface,
         | 
| 62 70 | 
             
                    /* .buft      = */ buft,
         | 
| @@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) { | |
| 106 114 | 
             
                return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
         | 
| 107 115 | 
             
            }
         | 
| 108 116 |  | 
| 117 | 
            +
            size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
         | 
| 118 | 
            +
                return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
         | 
| 119 | 
            +
            }
         | 
| 120 | 
            +
             | 
| 109 121 | 
             
            size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
         | 
| 110 122 | 
             
                return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
         | 
| 111 123 | 
             
            }
         | 
| @@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { | |
| 120 132 |  | 
| 121 133 | 
             
            void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
         | 
| 122 134 | 
             
                buffer->usage = usage;
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                // FIXME: add a generic callback to the buffer interface
         | 
| 137 | 
            +
                if (ggml_backend_buffer_is_multi_buffer(buffer)) {
         | 
| 138 | 
            +
                    ggml_backend_multi_buffer_set_usage(buffer, usage);
         | 
| 139 | 
            +
                }
         | 
| 123 140 | 
             
            }
         | 
| 124 141 |  | 
| 125 142 | 
             
            ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
         | 
| @@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) { | |
| 169 186 | 
             
                return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
         | 
| 170 187 | 
             
            }
         | 
| 171 188 |  | 
| 189 | 
            +
            size_t ggml_backend_get_max_size(ggml_backend_t backend) {
         | 
| 190 | 
            +
                return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
         | 
| 191 | 
            +
            }
         | 
| 192 | 
            +
             | 
| 172 193 | 
             
            void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
         | 
| 173 194 | 
             
                GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
         | 
| 174 195 | 
             
                GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
         | 
| @@ -337,11 +358,26 @@ GGML_CALL static void ggml_backend_registry_init(void) { | |
| 337 358 | 
             
                ggml_backend_cuda_reg_devices();
         | 
| 338 359 | 
             
            #endif
         | 
| 339 360 |  | 
| 361 | 
            +
            #ifdef GGML_USE_SYCL
         | 
| 362 | 
            +
                extern void ggml_backend_sycl_reg_devices(void);
         | 
| 363 | 
            +
                ggml_backend_sycl_reg_devices();
         | 
| 364 | 
            +
            #endif
         | 
| 365 | 
            +
             | 
| 340 366 | 
             
            #ifdef GGML_USE_METAL
         | 
| 341 367 | 
             
                extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
         | 
| 342 368 | 
             
                extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
         | 
| 343 369 | 
             
                ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
         | 
| 344 370 | 
             
            #endif
         | 
| 371 | 
            +
             | 
| 372 | 
            +
            #ifdef GGML_USE_VULKAN
         | 
| 373 | 
            +
                extern GGML_CALL int ggml_backend_vk_reg_devices(void);
         | 
| 374 | 
            +
                ggml_backend_vk_reg_devices();
         | 
| 375 | 
            +
            #endif
         | 
| 376 | 
            +
             | 
| 377 | 
            +
            #ifdef GGML_USE_KOMPUTE
         | 
| 378 | 
            +
                extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
         | 
| 379 | 
            +
                ggml_backend_kompute_reg_devices();
         | 
| 380 | 
            +
            #endif
         | 
| 345 381 | 
             
            }
         | 
| 346 382 |  | 
| 347 383 | 
             
            GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
         | 
| @@ -545,6 +581,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { | |
| 545 581 | 
             
                        /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
         | 
| 546 582 | 
             
                        /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
         | 
| 547 583 | 
             
                        /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
         | 
| 584 | 
            +
                        /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
         | 
| 548 585 | 
             
                        /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         | 
| 549 586 | 
             
                        /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
         | 
| 550 587 | 
             
                        /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         | 
| @@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { | |
| 600 637 | 
             
                        /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
         | 
| 601 638 | 
             
                        /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
         | 
| 602 639 | 
             
                        /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
         | 
| 640 | 
            +
                        /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
         | 
| 603 641 | 
             
                        /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         | 
| 604 642 | 
             
                        /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
         | 
| 605 643 | 
             
                        /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         | 
| @@ -756,6 +794,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v | |
| 756 794 | 
             
                GGML_UNUSED(user_data);
         | 
| 757 795 | 
             
            }
         | 
| 758 796 |  | 
| 797 | 
            +
            // multi-buffer buffer
         | 
| 798 | 
            +
             | 
| 799 | 
            +
            struct ggml_backend_multi_buffer_context {
         | 
| 800 | 
            +
                ggml_backend_buffer_t * buffers;
         | 
| 801 | 
            +
                size_t n_buffers;
         | 
| 802 | 
            +
            };
         | 
| 803 | 
            +
             | 
| 804 | 
            +
            typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
         | 
| 805 | 
            +
             | 
| 806 | 
            +
            GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
         | 
| 807 | 
            +
                ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
         | 
| 808 | 
            +
             | 
| 809 | 
            +
                return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
         | 
| 810 | 
            +
            }
         | 
| 811 | 
            +
             | 
| 812 | 
            +
            GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
         | 
| 813 | 
            +
                ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
         | 
| 814 | 
            +
                for (size_t i = 0; i < ctx->n_buffers; i++) {
         | 
| 815 | 
            +
                    ggml_backend_buffer_free(ctx->buffers[i]);
         | 
| 816 | 
            +
                }
         | 
| 817 | 
            +
             | 
| 818 | 
            +
                free(ctx->buffers);
         | 
| 819 | 
            +
                free(ctx);
         | 
| 820 | 
            +
            }
         | 
| 821 | 
            +
             | 
| 822 | 
            +
            GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
         | 
| 823 | 
            +
                ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
         | 
| 824 | 
            +
                for (size_t i = 0; i < ctx->n_buffers; i++) {
         | 
| 825 | 
            +
                    ggml_backend_buffer_clear(ctx->buffers[i], value);
         | 
| 826 | 
            +
                }
         | 
| 827 | 
            +
            }
         | 
| 828 | 
            +
             | 
| 829 | 
            +
            static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
         | 
| 830 | 
            +
                static struct ggml_backend_buffer_i multi_backend_buffer_i = {
         | 
| 831 | 
            +
                    /* .get_name        = */ ggml_backend_multi_buffer_get_name,
         | 
| 832 | 
            +
                    /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
         | 
| 833 | 
            +
                    /* .get_base        = */ NULL,
         | 
| 834 | 
            +
                    /* .init_tensor     = */ NULL,
         | 
| 835 | 
            +
                    /* .set_tensor      = */ NULL,
         | 
| 836 | 
            +
                    /* .get_tensor      = */ NULL,
         | 
| 837 | 
            +
                    /* .cpy_tensor      = */ NULL,
         | 
| 838 | 
            +
                    /* .clear           = */ ggml_backend_multi_buffer_clear,
         | 
| 839 | 
            +
                    /* .reset           = */ NULL,
         | 
| 840 | 
            +
                };
         | 
| 841 | 
            +
             | 
| 842 | 
            +
                return multi_backend_buffer_i;
         | 
| 843 | 
            +
            }
         | 
| 844 | 
            +
             | 
| 845 | 
            +
            GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
         | 
| 846 | 
            +
                ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
         | 
| 847 | 
            +
                ctx->n_buffers = n_buffers;
         | 
| 848 | 
            +
                ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
         | 
| 849 | 
            +
             | 
| 850 | 
            +
                size_t total_size = 0;
         | 
| 851 | 
            +
                for (size_t i = 0; i < n_buffers; i++) {
         | 
| 852 | 
            +
                    ctx->buffers[i] = buffers[i];
         | 
| 853 | 
            +
                    total_size += ggml_backend_buffer_get_size(buffers[i]);
         | 
| 854 | 
            +
                }
         | 
| 855 | 
            +
             | 
| 856 | 
            +
                return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
         | 
| 857 | 
            +
            }
         | 
| 858 | 
            +
             | 
| 859 | 
            +
            GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
         | 
| 860 | 
            +
                return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
         | 
| 861 | 
            +
            }
         | 
| 862 | 
            +
             | 
| 863 | 
            +
            GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
         | 
| 864 | 
            +
                GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
         | 
| 865 | 
            +
                ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
         | 
| 866 | 
            +
                for (size_t i = 0; i < ctx->n_buffers; i++) {
         | 
| 867 | 
            +
                    ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
         | 
| 868 | 
            +
                }
         | 
| 869 | 
            +
            }
         | 
| 870 | 
            +
             | 
| 759 871 |  | 
| 760 872 | 
             
            // scheduler
         | 
| 761 873 |  | 
| @@ -20,6 +20,7 @@ extern "C" { | |
| 20 20 | 
             
                GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
         | 
| 21 21 | 
             
                GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
         | 
| 22 22 | 
             
                GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
         | 
| 23 | 
            +
                GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
         | 
| 23 24 | 
             
                GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
         | 
| 24 25 | 
             
                GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
         | 
| 25 26 | 
             
                GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
         | 
| @@ -36,6 +37,7 @@ extern "C" { | |
| 36 37 | 
             
                GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
         | 
| 37 38 | 
             
                GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         | 
| 38 39 | 
             
                GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
         | 
| 40 | 
            +
                GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
         | 
| 39 41 | 
             
                GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         | 
| 40 42 | 
             
                GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
         | 
| 41 43 | 
             
                GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
         | 
| @@ -54,6 +56,7 @@ extern "C" { | |
| 54 56 | 
             
                GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
         | 
| 55 57 | 
             
                GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
         | 
| 56 58 | 
             
                GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
         | 
| 59 | 
            +
                GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
         | 
| 57 60 |  | 
| 58 61 | 
             
                GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         | 
| 59 62 | 
             
                GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
         |