llama_cpp 0.12.3 → 0.12.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 83b20bdc6944ddf63f11d7cc5147cc24b16f3d32c65fe3b85e88b7d432cd4091
-  data.tar.gz: a9ce5a9b1b6586f2b015c0ef881197ad857b63a684018874e7ededf9578aa04e
+  metadata.gz: e77376858bfb07c67b29963a898f3cf9f2494a5cadabbc4cf777e87af801b33c
+  data.tar.gz: 1196c932182a2c76416c326dac934e97cb9111e6bed269c4776e05587391b916
 SHA512:
-  metadata.gz: 96be1dd20547fc62e695be0e1725c3861a4694cd496dd45ff29da0f4d89af2b33e0f7ab89872ff21549a406e62e4bdf4cefd0986cebe42fc8102f0cf15a989bf
-  data.tar.gz: 262feb8b262b3f20c991ddaf2081e180648a65762afd8078a1627e6fd8a6d6e552702089c0a1b9a048e220bc60de97983bbcd6d8f4b894c124a689ee59ff757b
+  metadata.gz: 594f4af7e1e88f156926b7605683e29b47a7caf3afb2c18434fa0035415902fb51a9dafe845a4a108bce0dfdd9ad63b5301790826ee6995fa1799cf2bff0c1ee
+  data.tar.gz: 4199b0e417efc0e469172c147aa766a81b3f073158eefc13315ab50e4240a4e2f41611e3c87939f4d3012357edf339b1450e49f2bc324f37f92040396342d476
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+## [[0.12.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.3...v0.12.4)] - 2024-02-03
+
+- Bump bundled llama.cpp from b1971 to b2047.
+- Add constant for file type: `LLAMA_FTYPE_MOSTLY_IQ3_XXS`.
+- Add `supports_mmap?`, `supports_mlock?`, and `supports_gpu_offload?` module functions to `LLaMACpp`.
+- Add `--with-vulkan` configuration option.
+- Deprecate `mmap_supported?` and `mlock_supported?` module functions in `LLaMACpp`.
+- Remove `LLAMA_MAX_DEVICES` constant.
+
 ## [[0.12.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.2...v0.12.3)] - 2024-01-27
 
 - Bump bundled llama.cpp from b1892 to b1971.
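
For orientation, here is a minimal Ruby sketch of the 0.12.4 additions listed above; the module functions come straight from the changelog and the binding code further down, while the comments are only illustrative:

    require 'llama_cpp'

    # New capability checks added in 0.12.4.
    LLaMACpp.supports_mmap?        # => true/false, can model files be memory-mapped?
    LLaMACpp.supports_mlock?       # => true/false, can model memory be locked in RAM?
    LLaMACpp.supports_gpu_offload? # => true/false, was a GPU backend compiled in?

    # Deprecated but still available; now warns:
    # "mmap_supported? is deprecated. Use supports_mmap? instead."
    LLaMACpp.mmap_supported?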
@@ -19,6 +19,7 @@ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
 make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
+make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
 
 Dir.chdir(LLAMA_CPP_DIR) do
   _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
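
Given the `with_config('vulkan')` check added above, the new `--with-vulkan` option from the changelog would presumably be passed at install time like the existing backend switches, e.g. `gem install llama_cpp -- --with-vulkan` (illustrative invocation); all it does is append `LLAMA_VULKAN=1` to the `make lib` call against the bundled llama.cpp.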
@@ -843,15 +843,15 @@ private:
 
   // tensor_split
   static VALUE _llama_model_params_get_tensor_split(VALUE self) {
-    if (LLAMA_MAX_DEVICES < 1) {
+    if (llama_max_devices() < 1) {
       return rb_ary_new();
     }
-    VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
+    VALUE ret = rb_ary_new2(llama_max_devices());
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
     if (ptr->params.tensor_split == nullptr) {
       return rb_ary_new();
     }
-    for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
+    for (size_t i = 0; i < llama_max_devices(); i++) {
       rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
     }
     return ret;
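
As the hunk above shows, the binding now asks llama.cpp for `llama_max_devices()` at runtime instead of reading the removed `LLAMA_MAX_DEVICES` macro, and the Ruby-level constant of the same name is gone as well (see the changelog). A minimal migration sketch:

    # Before 0.12.4:
    # n = LLaMACpp::LLAMA_MAX_DEVICES   # constant, removed in this release
    # From 0.12.4 on:
    n = LLaMACpp.max_devices            # Integer, returned via SIZET2NUM in the binding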
@@ -3259,15 +3259,29 @@ static VALUE rb_llama_time_us(VALUE self) {
 }
 
 static VALUE rb_llama_mmap_supported(VALUE self) {
+  rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
   return llama_mmap_supported() ? Qtrue : Qfalse;
 }
 
 static VALUE rb_llama_mlock_supported(VALUE self) {
+  rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
   return llama_mlock_supported() ? Qtrue : Qfalse;
 }
 
 static VALUE rb_llama_max_devices(VALUE self) {
-  return INT2NUM(llama_max_devices());
+  return SIZET2NUM(llama_max_devices());
+}
+
+static VALUE rb_llama_supports_mmap(VALUE self) {
+  return llama_supports_mmap() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_supports_mlock(VALUE self) {
+  return llama_supports_mlock() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_supports_gpu_offload(VALUE self) {
+  return llama_supports_gpu_offload() ? Qtrue : Qfalse;
 }
 
 extern "C" void Init_llama_cpp(void) {
@@ -3294,8 +3308,9 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
-
-  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
+  rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
+  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
+  rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
@@ -3329,6 +3344,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.3'
+  VERSION = '0.12.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1971'
+  LLAMA_CPP_VERSION = 'b2047'
 end
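
A quick way to confirm which versions are in play after upgrading (the values are the constants defined above):

    require 'llama_cpp'

    LLaMACpp::VERSION           # => '0.12.4'
    LLaMACpp::LLAMA_CPP_VERSION # => 'b2047'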
data/sig/llama_cpp.rbs CHANGED
@@ -3,8 +3,6 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
-  LLAMA_MAX_DEVICES: Integer
-
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -26,6 +24,7 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -61,6 +60,9 @@ module LLaMACpp
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
+  def self?.supports_mmap?: () -> bool
+  def self?.supports_mlock?: () -> bool
+  def self?.supports_gpu_offload?: () -> bool
 
   class TokenData
     public
@@ -9,7 +9,7 @@ TEST_TARGETS = \
         tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
         tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
         tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-        tests/test-backend-ops tests/test-autorelease
+        tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -450,6 +450,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
         $(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST
 
+ifdef LLAMA_VULKAN
+    MK_CPPFLAGS += -DGGML_USE_VULKAN
+    MK_LDFLAGS += -lvulkan
+    OBJS += ggml-vulkan.o
+
+ifdef LLAMA_VULKAN_CHECK_RESULTS
+    MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
+ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+        $(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_VULKAN
+
 ifdef LLAMA_HIPBLAS
 
 ifeq ($(wildcard /opt/rocm),)
@@ -575,12 +588,15 @@ train.o: common/train.cpp common/train.h
 libllama.so: llama.o ggml.o $(OBJS)
         $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
+libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+        ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+
 lib: llama.o ggml.o $(OBJS)
         $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
         ar rcs libllama.a $^
 
 clean:
-        rm -vrf *.o tests/*.o *.so *.dll *.dylib *.a benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+        rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 
 #
 # Examples
@@ -625,7 +641,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
         $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
         $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@@ -754,5 +770,8 @@ tests/test-c.o: tests/test-c.c llama.h
 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
         $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+        $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
         $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
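
Taken together with the extconf change above, building the bundled library with `make lib LLAMA_VULKAN=1` now compiles `ggml-vulkan.o` with `-DGGML_USE_VULKAN` and links `-lvulkan`, which is what the gem's new `--with-vulkan` option ends up requesting.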
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
 }
 
 size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-    return alloc->max_size;
+    // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+    // to avoid this, we add a 10% margin to the buffer size
+    return alloc->max_size + alloc->max_size/10;
 }
 
 // graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
 }
 
 // utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-    size_t nbytes = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-    }
-
-    if (nbytes == 0) {
-        // all the tensors in the context are already allocated
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
 
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+static bool alloc_tensor_range(struct ggml_context * ctx,
+        struct ggml_tensor * first, struct ggml_tensor * last,
+        ggml_backend_buffer_type_t buft, size_t size,
+        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
-        // failed to allocate buffer
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
-        return NULL;
+        for (size_t i = 0; i < *n_buffers; i++) {
+            ggml_backend_buffer_free(*buffers[i]);
+        }
+        free(*buffers);
+        return false;
     }
 
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);
@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     ggml_tallocr_free(tallocr);
 
+    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+    (*buffers)[(*n_buffers)++] = buffer;
+
+    return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    size_t cur_buf_size = 0;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        if (this_size > max_size) {
+            // tensor is too large to fit in a single buffer
+            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+                    __func__, t->name,
+                    ggml_backend_buft_name(buft),
+                    this_size, max_size);
+            for (size_t i = 0; i < n_buffers; i++) {
+                ggml_backend_buffer_free(buffers[i]);
+            }
+            free(buffers);
+            return NULL;
+        }
+
+        if ((cur_buf_size + this_size) > max_size) {
+            // allocate tensors in the current buffer
+            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
+            first = t;
+            cur_buf_size = this_size;
+        } else {
+            cur_buf_size += this_size;
+        }
+    }
+
+    // allocate remaining tensors
+    if (cur_buf_size > 0) {
+        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+    }
+
+    if (n_buffers == 0) {
+        // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
     return buffer;
 }
 
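
In short, `ggml_backend_alloc_ctx_tensors_from_buft` no longer allocates a single buffer for the whole context: it walks the tensors, starts a new backing buffer via `alloc_tensor_range` whenever the running size would exceed `ggml_backend_buft_get_max_size(buft)`, and wraps multiple buffers in the multi-buffer object introduced later in this diff; when one buffer suffices, that buffer is returned directly.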
@@ -19,6 +19,7 @@ extern "C" {
         const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
         size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
         size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
     // do not use directly, use ggml_backend_tensor_copy instead
     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
     //
     // Backend
     //
@@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }
 
+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+    // get_max_size is optional, defaults to SIZE_MAX
+    if (buft->iface.get_max_size) {
+        return buft->iface.get_max_size(buft);
+    }
+    return SIZE_MAX;
+}
+
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
     }
     return ggml_nbytes(tensor);
 }
@@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                size_t size) {
     ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
 
-    GGML_ASSERT(iface.get_base != NULL);
-
     (*buffer) = (struct ggml_backend_buffer) {
         /* .interface = */ iface,
         /* .buft = */ buft,
@@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
 
+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
@@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
 
 void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
     buffer->usage = usage;
+
+    // FIXME: add a generic callback to the buffer interface
+    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+        ggml_backend_multi_buffer_set_usage(buffer, usage);
+    }
 }
 
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
     return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
 }
 
+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
 void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@@ -337,11 +358,26 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_cuda_reg_devices();
 #endif
 
+#ifdef GGML_USE_SYCL
+    extern void ggml_backend_sycl_reg_devices(void);
+    ggml_backend_sycl_reg_devices();
+#endif
+
 #ifdef GGML_USE_METAL
     extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
     extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
+
+#ifdef GGML_USE_VULKAN
+    extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+    ggml_backend_vk_reg_devices();
+#endif
+
+#ifdef GGML_USE_KOMPUTE
+    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
+    ggml_backend_kompute_reg_devices();
+#endif
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@@ -545,6 +581,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
             /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -756,6 +794,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
     GGML_UNUSED(user_data);
 }
 
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_free(ctx->buffers[i]);
+    }
+
+    free(ctx->buffers);
+    free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_clear(ctx->buffers[i], value);
+    }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+    static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+        /* .get_name = */ ggml_backend_multi_buffer_get_name,
+        /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
+        /* .get_base = */ NULL,
+        /* .init_tensor = */ NULL,
+        /* .set_tensor = */ NULL,
+        /* .get_tensor = */ NULL,
+        /* .cpy_tensor = */ NULL,
+        /* .clear = */ ggml_backend_multi_buffer_clear,
+        /* .reset = */ NULL,
+    };
+
+    return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+    }
+}
+
 
 // scheduler
 
@@ -20,6 +20,7 @@ extern "C" {
     GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
    GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
@@ -36,6 +37,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
     GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
@@ -54,6 +56,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
     GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
     GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
 
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);