llama_cpp 0.12.2 → 0.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: a65b477c93060832783d03b065dd336820bf22e985dd7b9f53a20e5834f29a0d
-   data.tar.gz: 3ab3f5147bb207ddeea4b902e86de41398fbe497bb521ab00a4fe89ccd790d50
+   metadata.gz: e77376858bfb07c67b29963a898f3cf9f2494a5cadabbc4cf777e87af801b33c
+   data.tar.gz: 1196c932182a2c76416c326dac934e97cb9111e6bed269c4776e05587391b916
  SHA512:
-   metadata.gz: 119a77a344ece09afda87d89321f679b9c53975c6b340150e298fa3869a0bf48849fafd49e5ef18b001311aae10e3fa9aba29c96de2c4aa8535cdad7d01382cb
-   data.tar.gz: 444fc224413ee6fc94b0866da07460e9c95162941fcd80c831c6f7a950373503eba74b10d437724db2c9debec4719c5a9b25875f1b0a014c956bcb424ca8bf47
+   metadata.gz: 594f4af7e1e88f156926b7605683e29b47a7caf3afb2c18434fa0035415902fb51a9dafe845a4a108bce0dfdd9ad63b5301790826ee6995fa1799cf2bff0c1ee
+   data.tar.gz: 4199b0e417efc0e469172c147aa766a81b3f073158eefc13315ab50e4240a4e2f41611e3c87939f4d3012357edf339b1450e49f2bc324f37f92040396342d476
data/CHANGELOG.md CHANGED
@@ -1,3 +1,18 @@
+ ## [[0.12.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.3...v0.12.4)] - 2024-02-03
+
+ - Bump bundled llama.cpp from b1971 to b2047.
+ - Add constant for file type: `LLAMA_FTYPE_MOSTLY_IQ3_XXS`.
+ - Add `supports_mmap?`, `supports_mlock?`, and `supports_gpu_offload?` module functions to `LLaMACpp`.
+ - Add `--with-vulkan` configuration option.
+ - Deprecate `mmap_supported?` and `mlock_supported?` module functions in `LLaMACpp`.
+ - Remove `LLAMA_MAX_DEVICES` constant.
+
+ ## [[0.12.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.2...v0.12.3)] - 2024-01-27
+
+ - Bump bundled llama.cpp from b1892 to b1971.
+ - Add constant for file type: `LLAMA_FTYPE_MOSTLY_Q3_K_XS`.
+ - Add `sample_entropy` method to Context.
+
  ## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
 
  - Bump bundled llama.cpp from b1833 to b1892.
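The 0.12.4 and 0.12.3 entries above mostly add query functions and a new sampling API on top of the bumped llama.cpp build. A minimal sketch of the new module-level capability checks (assuming the gem is installed; the results depend on how the bundled llama.cpp was compiled):

```ruby
require 'llama_cpp'

# New in 0.12.4: ask the bundled llama.cpp build what it supports.
puts "mmap:        #{LLaMACpp.supports_mmap?}"
puts "mlock:       #{LLaMACpp.supports_mlock?}"
puts "GPU offload: #{LLaMACpp.supports_gpu_offload?}"
puts "max devices: #{LLaMACpp.max_devices}" # replaces the removed LLAMA_MAX_DEVICES constant

# The new quantization file-type constants are plain integers on the module.
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_Q3_K_XS  # added in 0.12.3
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ3_XXS  # added in 0.12.4
```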
data/README.md CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
  # use OpenBLAS
  $ gem install llama_cpp -- --with-openblas
 
- # use CUDA
- $ gem install llama_cpp -- --with-cuda
+ # use cuBLAS
+ $ gem install llama_cpp -- --with-cublas
  ```
 
  Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/ext/llama_cpp/extconf.rb CHANGED
@@ -19,6 +19,7 @@ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
  make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
  make_envs << ' LLAMA_MPI=1' if with_config('mpi')
+ make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
 
  Dir.chdir(LLAMA_CPP_DIR) do
    _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
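This hunk is the hook behind the `--with-vulkan` changelog entry: each `--with-*` option passed after `gem install llama_cpp --` is read via mkmf's `with_config` and appended to the environment of the `make lib` call shown above. A self-contained sketch of that mapping (not the real extconf.rb; the install command follows the pattern the README already uses):

```ruby
# Assumed invocation, mirroring the README examples above:
#   gem install llama_cpp -- --with-vulkan
#
# Toy reimplementation of the option -> make-variable mapping from the hunk;
# the real extconf.rb uses mkmf's with_config rather than scanning ARGV.
BACKEND_FLAGS = {
  'cublas'  => ' LLAMA_CUBLAS=1',
  'clblast' => ' LLAMA_CLBLAST=1',
  'hipblas' => ' LLAMA_HIPBLAS=1',
  'mpi'     => ' LLAMA_MPI=1',
  'vulkan'  => ' LLAMA_VULKAN=1' # new in 0.12.4
}.freeze

make_envs = BACKEND_FLAGS.select { |opt, _| ARGV.include?("--with-#{opt}") }.values.join
puts "make lib#{make_envs}".strip
```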
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -843,15 +843,15 @@ private:
 
    // tensor_split
    static VALUE _llama_model_params_get_tensor_split(VALUE self) {
-     if (LLAMA_MAX_DEVICES < 1) {
+     if (llama_max_devices() < 1) {
        return rb_ary_new();
      }
-     VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
+     VALUE ret = rb_ary_new2(llama_max_devices());
      LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
      if (ptr->params.tensor_split == nullptr) {
        return rb_ary_new();
      }
-     for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
+     for (size_t i = 0; i < llama_max_devices(); i++) {
        rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
      }
      return ret;
@@ -2054,6 +2054,7 @@ public:
    rb_define_method(rb_cLLaMAContext, "sample_tail_free", RUBY_METHOD_FUNC(_llama_context_sample_tail_free), -1);
    rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
    rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
+   rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
    rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
    rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
    rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
@@ -2904,6 +2905,50 @@ private:
      return Qnil;
    }
 
+   static VALUE _llama_context_sample_entropy(int argc, VALUE* argv, VALUE self) {
+     VALUE kw_args = Qnil;
+     ID kw_table[3] = { rb_intern("min_temp"), rb_intern("max_temp"), rb_intern("exponent_val") };
+     VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+     VALUE candidates = Qnil;
+     rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
+     rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
+
+     if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
+       rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
+       return Qnil;
+     }
+     if (!RB_FLOAT_TYPE_P(kw_values[0])) {
+       rb_raise(rb_eArgError, "min_temp must be a float");
+       return Qnil;
+     }
+     if (!RB_FLOAT_TYPE_P(kw_values[1])) {
+       rb_raise(rb_eArgError, "max_temp must be a float");
+       return Qnil;
+     }
+     if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+       rb_raise(rb_eArgError, "exponent_val must be a float");
+       return Qnil;
+     }
+
+     LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+     if (ctx_ptr->ctx == NULL) {
+       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+       return Qnil;
+     }
+     LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
+     if (cnd_ptr->array.data == nullptr) {
+       rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
+       return Qnil;
+     }
+     const float min_temp = NUM2DBL(kw_values[0]);
+     const float max_temp = NUM2DBL(kw_values[1]);
+     const float exponent_val = NUM2DBL(kw_values[2]);
+
+     llama_sample_entropy(ctx_ptr->ctx, &(cnd_ptr->array), min_temp, max_temp, exponent_val);
+
+     return Qnil;
+   }
+
    static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
      VALUE kw_args = Qnil;
      ID kw_table[1] = { rb_intern("temperature") };
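On the Ruby side, the new binding above is called with keyword arguments matching the signature added to `sig/llama_cpp.rbs` further down (`sample_entropy(TokenDataArray, min_temp:, max_temp:, exponent_val:)`). A rough usage sketch; the model/context setup and the way candidates are built from the logits follow the gem's usual sampling flow and should be read as assumptions, not an exact recipe:

```ruby
require 'llama_cpp'

# Assumption: a local GGUF model; the setup mirrors the gem's other sampling examples.
model   = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# ... decode a prompt here, then build a TokenDataArray from the resulting logits ...
logits     = context.logits
candidates = LLaMACpp::TokenDataArray.new(
  Array.new(model.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
)

# New in 0.12.3: entropy-based (dynamic temperature) sampling, wrapping llama_sample_entropy.
context.sample_entropy(candidates, min_temp: 0.1, max_temp: 2.0, exponent_val: 1.0)
```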
@@ -3214,15 +3259,29 @@ static VALUE rb_llama_time_us(VALUE self) {
  }
 
  static VALUE rb_llama_mmap_supported(VALUE self) {
+   rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
    return llama_mmap_supported() ? Qtrue : Qfalse;
  }
 
  static VALUE rb_llama_mlock_supported(VALUE self) {
+   rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
    return llama_mlock_supported() ? Qtrue : Qfalse;
  }
 
  static VALUE rb_llama_max_devices(VALUE self) {
-   return INT2NUM(llama_max_devices());
+   return SIZET2NUM(llama_max_devices());
+ }
+
+ static VALUE rb_llama_supports_mmap(VALUE self) {
+   return llama_supports_mmap() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_supports_mlock(VALUE self) {
+   return llama_supports_mlock() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_supports_gpu_offload(VALUE self) {
+   return llama_supports_gpu_offload() ? Qtrue : Qfalse;
  }
 
  extern "C" void Init_llama_cpp(void) {
@@ -3249,8 +3308,9 @@ extern "C" void Init_llama_cpp(void) {
    rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
    rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
    rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
-
-   rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
+   rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
+   rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
+   rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
    rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
    rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
@@ -3283,6 +3343,8 @@ extern "C" void Init_llama_cpp(void) {
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
 
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.12.2'
+   VERSION = '0.12.4'
 
    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'b1892'
+   LLAMA_CPP_VERSION = 'b2047'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -3,8 +3,6 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
-  LLAMA_MAX_DEVICES: Integer
-
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -25,6 +23,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -60,6 +60,9 @@ module LLaMACpp
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
+  def self?.supports_mmap?: () -> bool
+  def self?.supports_mlock?: () -> bool
+  def self?.supports_gpu_offload?: () -> bool
 
   class TokenData
     public
@@ -216,6 +219,7 @@ module LLaMACpp
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
+    def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
     def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
     def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
@@ -9,7 +9,7 @@ TEST_TARGETS = \
  tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
  tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
  tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops
+ tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
 
  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -450,6 +450,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
  	$(CXX) $(CXXFLAGS) -c $< -o $@
  endif # LLAMA_CLBLAST
 
+ ifdef LLAMA_VULKAN
+ 	MK_CPPFLAGS += -DGGML_USE_VULKAN
+ 	MK_LDFLAGS += -lvulkan
+ 	OBJS += ggml-vulkan.o
+
+ ifdef LLAMA_VULKAN_CHECK_RESULTS
+ 	MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+ endif
+
+ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+ 	$(CXX) $(CXXFLAGS) -c $< -o $@
+ endif # LLAMA_VULKAN
+
  ifdef LLAMA_HIPBLAS
 
  ifeq ($(wildcard /opt/rocm),)
@@ -575,12 +588,15 @@ train.o: common/train.cpp common/train.h
  libllama.so: llama.o ggml.o $(OBJS)
  	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
+ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+ 	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+
  lib: llama.o ggml.o $(OBJS)
  	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
  	ar rcs libllama.a $^
 
  clean:
- 	rm -vrf *.o tests/*.o *.so *.dll *.dylib *.a benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ 	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 
  #
  # Examples
@@ -625,7 +641,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
  save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
- server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@@ -753,3 +769,9 @@ tests/test-c.o: tests/test-c.c llama.h
 
  tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
  	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+ 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+ tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+ 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
      if (block->size >= size) {
          best_fit_block = alloc->n_free_blocks - 1;
      } else {
-         fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                 __func__, size, max_avail);
+         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
+                 __func__, tensor->name, size, max_avail);
          GGML_ASSERT(!"not enough space in the buffer");
          return;
      }
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
  }
 
  size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-     return alloc->max_size;
+     // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+     // to avoid this, we add a 10% margin to the buffer size
+     return alloc->max_size + alloc->max_size/10;
  }
 
  // graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
  }
 
  // utils
- ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-     size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-     size_t nbytes = 0;
-     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-         if (t->data == NULL && t->view_src == NULL) {
-             nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-         }
-     }
-
-     if (nbytes == 0) {
-         // all the tensors in the context are already allocated
- #ifndef NDEBUG
-         fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
- #endif
-         return NULL;
-     }
 
-     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+ static bool alloc_tensor_range(struct ggml_context * ctx,
+         struct ggml_tensor * first, struct ggml_tensor * last,
+         ggml_backend_buffer_type_t buft, size_t size,
+         ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
      if (buffer == NULL) {
-         // failed to allocate buffer
  #ifndef NDEBUG
-         fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+         fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
  #endif
-         return NULL;
+         for (size_t i = 0; i < *n_buffers; i++) {
+             ggml_backend_buffer_free(*buffers[i]);
+         }
+         free(*buffers);
+         return false;
      }
 
      ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
-     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
          if (t->data == NULL) {
              if (t->view_src == NULL) {
                  ggml_tallocr_alloc(tallocr, t);
824
814
 
825
815
  ggml_tallocr_free(tallocr);
826
816
 
817
+ *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
818
+ (*buffers)[(*n_buffers)++] = buffer;
819
+
820
+ return true;
821
+ }
822
+
823
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
824
+ GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
825
+
826
+ size_t alignment = ggml_backend_buft_get_alignment(buft);
827
+ size_t max_size = ggml_backend_buft_get_max_size(buft);
828
+
829
+ ggml_backend_buffer_t * buffers = NULL;
830
+ size_t n_buffers = 0;
831
+
832
+ size_t cur_buf_size = 0;
833
+ struct ggml_tensor * first = ggml_get_first_tensor(ctx);
834
+ for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
835
+ size_t this_size = 0;
836
+ if (t->data == NULL && t->view_src == NULL) {
837
+ this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
838
+ }
839
+
840
+ if (this_size > max_size) {
841
+ // tensor is too large to fit in a single buffer
842
+ fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
843
+ __func__, t->name,
844
+ ggml_backend_buft_name(buft),
845
+ this_size, max_size);
846
+ for (size_t i = 0; i < n_buffers; i++) {
847
+ ggml_backend_buffer_free(buffers[i]);
848
+ }
849
+ free(buffers);
850
+ return NULL;
851
+ }
852
+
853
+ if ((cur_buf_size + this_size) > max_size) {
854
+ // allocate tensors in the current buffer
855
+ if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
856
+ return NULL;
857
+ }
858
+ first = t;
859
+ cur_buf_size = this_size;
860
+ } else {
861
+ cur_buf_size += this_size;
862
+ }
863
+ }
864
+
865
+ // allocate remaining tensors
866
+ if (cur_buf_size > 0) {
867
+ if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
868
+ return NULL;
869
+ }
870
+ }
871
+
872
+ if (n_buffers == 0) {
873
+ // all the tensors in the context are already allocated
874
+ #ifndef NDEBUG
875
+ fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
876
+ #endif
877
+ return NULL;
878
+ }
879
+
880
+ ggml_backend_buffer_t buffer;
881
+ if (n_buffers == 1) {
882
+ buffer = buffers[0];
883
+ } else {
884
+ buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
885
+ }
886
+ free(buffers);
827
887
  return buffer;
828
888
  }
829
889
 
@@ -19,6 +19,7 @@ extern "C" {
      const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
      ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
      size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+     size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
      size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
      bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
      // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
      // do not use directly, use ggml_backend_tensor_copy instead
      bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+     // buffer that contains a collection of buffers
+     GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+     GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+     GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
      //
      // Backend
      //