llama_cpp 0.12.2 → 0.12.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: a65b477c93060832783d03b065dd336820bf22e985dd7b9f53a20e5834f29a0d
- data.tar.gz: 3ab3f5147bb207ddeea4b902e86de41398fbe497bb521ab00a4fe89ccd790d50
+ metadata.gz: e77376858bfb07c67b29963a898f3cf9f2494a5cadabbc4cf777e87af801b33c
+ data.tar.gz: 1196c932182a2c76416c326dac934e97cb9111e6bed269c4776e05587391b916
  SHA512:
- metadata.gz: 119a77a344ece09afda87d89321f679b9c53975c6b340150e298fa3869a0bf48849fafd49e5ef18b001311aae10e3fa9aba29c96de2c4aa8535cdad7d01382cb
- data.tar.gz: 444fc224413ee6fc94b0866da07460e9c95162941fcd80c831c6f7a950373503eba74b10d437724db2c9debec4719c5a9b25875f1b0a014c956bcb424ca8bf47
+ metadata.gz: 594f4af7e1e88f156926b7605683e29b47a7caf3afb2c18434fa0035415902fb51a9dafe845a4a108bce0dfdd9ad63b5301790826ee6995fa1799cf2bff0c1ee
+ data.tar.gz: 4199b0e417efc0e469172c147aa766a81b3f073158eefc13315ab50e4240a4e2f41611e3c87939f4d3012357edf339b1450e49f2bc324f37f92040396342d476
data/CHANGELOG.md CHANGED
@@ -1,3 +1,18 @@
+ ## [[0.12.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.3...v0.12.4)] - 2024-02-03
+
+ - Bump bundled llama.cpp from b1971 to b2047.
+ - Add constant for file type: `LLAMA_FTYPE_MOSTLY_IQ3_XXS`.
+ - Add `supports_mmap?`, `supports_mlock?`, and `supports_gpu_offload?` module functions to `LLaMACpp`.
+ - Add `--with-vulkan` configuration option.
+ - Deprecate `mmap_supported?` and `mlock_supported?` module functions in `LLaMACpp`.
+ - Remove `LLAMA_MAX_DEVICES` constant.
+
+ ## [[0.12.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.2...v0.12.3)] - 2024-01-27
+
+ - Bump bundled llama.cpp from b1892 to b1971.
+ - Add constant for file type: `LLAMA_FTYPE_MOSTLY_Q3_K_XS`.
+ - Add `sample_entropy` method to Context.
+
  ## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
 
  - Bump bundled llama.cpp from b1833 to b1892.
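In practical terms, the 0.12.4 entries above replace the deprecated `mmap_supported?`/`mlock_supported?` pair with new capability probes and drop the `LLAMA_MAX_DEVICES` constant in favour of the existing `max_devices` module function. A brief usage sketch (everything called here is defined in the hunks further down; only the `require` line is assumed):

```ruby
require 'llama_cpp'

# New in 0.12.4: capability probes exposed as module functions on LLaMACpp.
puts LLaMACpp.supports_mmap?        # replaces the deprecated LLaMACpp.mmap_supported?
puts LLaMACpp.supports_mlock?       # replaces the deprecated LLaMACpp.mlock_supported?
puts LLaMACpp.supports_gpu_offload? # true when the bundled llama.cpp was built with a GPU backend
puts LLaMACpp.max_devices           # still available; the LLAMA_MAX_DEVICES constant has been removed
```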
data/README.md CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
  # use OpenBLAS
  $ gem install llama_cpp -- --with-openblas
 
- # use CUDA
- $ gem install llama_cpp -- --with-cuda
+ # use cuBLAS
+ $ gem install llama_cpp -- --with-cublas
  ```
 
  Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/ext/llama_cpp/extconf.rb CHANGED
@@ -19,6 +19,7 @@ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
  make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
  make_envs << ' LLAMA_MPI=1' if with_config('mpi')
+ make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
 
  Dir.chdir(LLAMA_CPP_DIR) do
  _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
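Tying the README and extconf.rb changes together: each `--with-*` flag passed to `gem install` is read via mkmf's `with_config` and appended to the make variables used to build the bundled llama.cpp, so the new Vulkan backend would be requested with `gem install llama_cpp -- --with-vulkan`. A minimal, self-contained sketch of that pattern (not the gem's actual extconf.rb; `make_envs` here just illustrates the mapping shown above):

```ruby
# Sketch only: how a flag such as `gem install llama_cpp -- --with-vulkan`
# becomes a make variable for the bundled llama.cpp build.
require 'mkmf'

make_envs = +''
make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')

# The extension then runs something like the command printed below; with Vulkan
# enabled, the Makefile hunk further down compiles ggml-vulkan.o with
# -DGGML_USE_VULKAN and links against -lvulkan.
puts "make lib #{make_envs}".strip
```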
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -843,15 +843,15 @@ private:
 
  // tensor_split
  static VALUE _llama_model_params_get_tensor_split(VALUE self) {
- if (LLAMA_MAX_DEVICES < 1) {
+ if (llama_max_devices() < 1) {
  return rb_ary_new();
  }
- VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
+ VALUE ret = rb_ary_new2(llama_max_devices());
  LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
  if (ptr->params.tensor_split == nullptr) {
  return rb_ary_new();
  }
- for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
+ for (size_t i = 0; i < llama_max_devices(); i++) {
  rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
  }
  return ret;
@@ -2054,6 +2054,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_tail_free", RUBY_METHOD_FUNC(_llama_context_sample_tail_free), -1);
  rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
  rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
+ rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
  rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
  rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
@@ -2904,6 +2905,50 @@ private:
  return Qnil;
  }
 
+ static VALUE _llama_context_sample_entropy(int argc, VALUE* argv, VALUE self) {
+ VALUE kw_args = Qnil;
+ ID kw_table[3] = { rb_intern("min_temp"), rb_intern("max_temp"), rb_intern("exponent_val") };
+ VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+ VALUE candidates = Qnil;
+ rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
+ rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
+
+ if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
+ rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
+ return Qnil;
+ }
+ if (!RB_FLOAT_TYPE_P(kw_values[0])) {
+ rb_raise(rb_eArgError, "min_temp must be a float");
+ return Qnil;
+ }
+ if (!RB_FLOAT_TYPE_P(kw_values[1])) {
+ rb_raise(rb_eArgError, "max_temp must be a float");
+ return Qnil;
+ }
+ if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+ rb_raise(rb_eArgError, "exponent_val must be a float");
+ return Qnil;
+ }
+
+ LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+ if (ctx_ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
+ if (cnd_ptr->array.data == nullptr) {
+ rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
+ return Qnil;
+ }
+ const float min_temp = NUM2DBL(kw_values[0]);
+ const float max_temp = NUM2DBL(kw_values[1]);
+ const float exponent_val = NUM2DBL(kw_values[2]);
+
+ llama_sample_entropy(ctx_ptr->ctx, &(cnd_ptr->array), min_temp, max_temp, exponent_val);
+
+ return Qnil;
+ }
+
  static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[1] = { rb_intern("temperature") };
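For reference, the new binding above is exposed on `LLaMACpp::Context` as `sample_entropy` (see the RBS signature further down). A hedged usage sketch follows; the model/context setup, the `TokenData`/`TokenDataArray` construction, and the final `sample_token` call are assumptions based on the gem's existing sampling API and are not part of this diff:

```ruby
require 'llama_cpp'

# Assumed setup following the gem's existing API (not shown in this diff).
model   = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# Build candidates from the logits of the last evaluated token, then apply the
# dynamic-temperature (entropy) sampler added in this release.
# (Assumes a prompt has already been evaluated so that logits are populated.)
candidates = LLaMACpp::TokenDataArray.new(
  context.logits.each_with_index.map { |logit, id| LLaMACpp::TokenData.new(id: id, logit: logit, p: 0.0) }
)
context.sample_entropy(candidates, min_temp: 0.5, max_temp: 2.0, exponent_val: 1.0)
puts context.sample_token(candidates)
```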
@@ -3214,15 +3259,29 @@ static VALUE rb_llama_time_us(VALUE self) {
  }
 
  static VALUE rb_llama_mmap_supported(VALUE self) {
+ rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
  return llama_mmap_supported() ? Qtrue : Qfalse;
  }
 
  static VALUE rb_llama_mlock_supported(VALUE self) {
+ rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
  return llama_mlock_supported() ? Qtrue : Qfalse;
  }
 
  static VALUE rb_llama_max_devices(VALUE self) {
- return INT2NUM(llama_max_devices());
+ return SIZET2NUM(llama_max_devices());
+ }
+
+ static VALUE rb_llama_supports_mmap(VALUE self) {
+ return llama_supports_mmap() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_supports_mlock(VALUE self) {
+ return llama_supports_mlock() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_supports_gpu_offload(VALUE self) {
+ return llama_supports_gpu_offload() ? Qtrue : Qfalse;
  }
 
  extern "C" void Init_llama_cpp(void) {
@@ -3249,8 +3308,9 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
-
- rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
+ rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
+ rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
+ rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
@@ -3283,6 +3343,8 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
 
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.12.2'
+ VERSION = '0.12.4'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1892'
+ LLAMA_CPP_VERSION = 'b2047'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -3,8 +3,6 @@ module LLaMACpp
  LLAMA_CPP_VERSION: String
  LLAMA_DEFALUT_SEED: String
 
- LLAMA_MAX_DEVICES: Integer
-
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -25,6 +23,8 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
 
  LLAMA_KV_OVERRIDE_INT: Integer
  LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -60,6 +60,9 @@ module LLaMACpp
  def self?.mmap_supported?: () -> bool
  def self?.mlock_supported?: () -> bool
  def self?.max_devices: () -> Integer
+ def self?.supports_mmap?: () -> bool
+ def self?.supports_mlock?: () -> bool
+ def self?.supports_gpu_offload?: () -> bool
 
  class TokenData
  public
@@ -216,6 +219,7 @@ module LLaMACpp
  def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
  def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
  def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
+ def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
  def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
  def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
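Closing out the new constants: `LLAMA_FTYPE_MOSTLY_Q3_K_XS` and `LLAMA_FTYPE_MOSTLY_IQ3_XXS`, declared above in both the C extension and the RBS signatures, are quantization targets. A hedged sketch of how one would be used, assuming the `ModelQuantizeParams`/`LLaMACpp.model_quantize` API from earlier releases of the gem (none of which appears in this diff; only the constant is new):

```ruby
require 'llama_cpp'

# Assumption: ModelQuantizeParams#ftype= and LLaMACpp.model_quantize behave as in
# earlier releases of the gem; only the LLAMA_FTYPE_MOSTLY_Q3_K_XS constant is new here.
params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q3_K_XS

LLaMACpp.model_quantize(input_path: 'model-f16.gguf', output_path: 'model-q3_k_xs.gguf', params: params)
```

The remaining hunks below come from the bundled llama.cpp sources that were updated by the b1892 → b2047 bump noted in the changelog.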
@@ -9,7 +9,7 @@ TEST_TARGETS = \
  tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
  tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
  tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops
+ tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
 
  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -450,6 +450,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
  endif # LLAMA_CLBLAST
 
+ ifdef LLAMA_VULKAN
+ MK_CPPFLAGS += -DGGML_USE_VULKAN
+ MK_LDFLAGS += -lvulkan
+ OBJS += ggml-vulkan.o
+
+ ifdef LLAMA_VULKAN_CHECK_RESULTS
+ MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+ endif
+
+ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+ endif # LLAMA_VULKAN
+
  ifdef LLAMA_HIPBLAS
 
  ifeq ($(wildcard /opt/rocm),)
@@ -575,12 +588,15 @@ train.o: common/train.cpp common/train.h
  libllama.so: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
+ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+ ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+
  lib: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
  ar rcs libllama.a $^
 
  clean:
- rm -vrf *.o tests/*.o *.so *.dll *.dylib *.a benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 
  #
  # Examples
@@ -625,7 +641,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
  save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
- server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@@ -753,3 +769,9 @@ tests/test-c.o: tests/test-c.c llama.h
 
  tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+ tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
  if (block->size >= size) {
  best_fit_block = alloc->n_free_blocks - 1;
  } else {
- fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
- __func__, size, max_avail);
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
+ __func__, tensor->name, size, max_avail);
  GGML_ASSERT(!"not enough space in the buffer");
  return;
  }
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
  }
 
  size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
- return alloc->max_size;
+ // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+ // to avoid this, we add a 10% margin to the buffer size
+ return alloc->max_size + alloc->max_size/10;
  }
 
  // graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
  }
 
  // utils
- ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
- GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
- size_t alignment = ggml_backend_buft_get_alignment(buft);
-
- size_t nbytes = 0;
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
- if (t->data == NULL && t->view_src == NULL) {
- nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
- }
- }
-
- if (nbytes == 0) {
- // all the tensors in the context are already allocated
- #ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
- #endif
- return NULL;
- }
 
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+ static bool alloc_tensor_range(struct ggml_context * ctx,
+ struct ggml_tensor * first, struct ggml_tensor * last,
+ ggml_backend_buffer_type_t buft, size_t size,
+ ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
  if (buffer == NULL) {
- // failed to allocate buffer
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
  #endif
- return NULL;
+ for (size_t i = 0; i < *n_buffers; i++) {
+ ggml_backend_buffer_free(*buffers[i]);
+ }
+ free(*buffers);
+ return false;
  }
 
  ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
  if (t->data == NULL) {
  if (t->view_src == NULL) {
  ggml_tallocr_alloc(tallocr, t);
@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
  ggml_tallocr_free(tallocr);
 
+ *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+ (*buffers)[(*n_buffers)++] = buffer;
+
+ return true;
+ }
+
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+ GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+ size_t alignment = ggml_backend_buft_get_alignment(buft);
+ size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+ ggml_backend_buffer_t * buffers = NULL;
+ size_t n_buffers = 0;
+
+ size_t cur_buf_size = 0;
+ struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+ for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ size_t this_size = 0;
+ if (t->data == NULL && t->view_src == NULL) {
+ this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+ }
+
+ if (this_size > max_size) {
+ // tensor is too large to fit in a single buffer
+ fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+ __func__, t->name,
+ ggml_backend_buft_name(buft),
+ this_size, max_size);
+ for (size_t i = 0; i < n_buffers; i++) {
+ ggml_backend_buffer_free(buffers[i]);
+ }
+ free(buffers);
+ return NULL;
+ }
+
+ if ((cur_buf_size + this_size) > max_size) {
+ // allocate tensors in the current buffer
+ if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+ return NULL;
+ }
+ first = t;
+ cur_buf_size = this_size;
+ } else {
+ cur_buf_size += this_size;
+ }
+ }
+
+ // allocate remaining tensors
+ if (cur_buf_size > 0) {
+ if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+ return NULL;
+ }
+ }
+
+ if (n_buffers == 0) {
+ // all the tensors in the context are already allocated
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+ #endif
+ return NULL;
+ }
+
+ ggml_backend_buffer_t buffer;
+ if (n_buffers == 1) {
+ buffer = buffers[0];
+ } else {
+ buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+ }
+ free(buffers);
  return buffer;
  }
 
@@ -19,6 +19,7 @@ extern "C" {
  const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
  ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
  size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+ size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
  size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
  bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
  // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
  // do not use directly, use ggml_backend_tensor_copy instead
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+ // buffer that contains a collection of buffers
+ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+ GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
  //
  // Backend
  //