llama_cpp 0.12.3 → 0.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 83b20bdc6944ddf63f11d7cc5147cc24b16f3d32c65fe3b85e88b7d432cd4091
- data.tar.gz: a9ce5a9b1b6586f2b015c0ef881197ad857b63a684018874e7ededf9578aa04e
+ metadata.gz: e77376858bfb07c67b29963a898f3cf9f2494a5cadabbc4cf777e87af801b33c
+ data.tar.gz: 1196c932182a2c76416c326dac934e97cb9111e6bed269c4776e05587391b916
  SHA512:
- metadata.gz: 96be1dd20547fc62e695be0e1725c3861a4694cd496dd45ff29da0f4d89af2b33e0f7ab89872ff21549a406e62e4bdf4cefd0986cebe42fc8102f0cf15a989bf
- data.tar.gz: 262feb8b262b3f20c991ddaf2081e180648a65762afd8078a1627e6fd8a6d6e552702089c0a1b9a048e220bc60de97983bbcd6d8f4b894c124a689ee59ff757b
+ metadata.gz: 594f4af7e1e88f156926b7605683e29b47a7caf3afb2c18434fa0035415902fb51a9dafe845a4a108bce0dfdd9ad63b5301790826ee6995fa1799cf2bff0c1ee
+ data.tar.gz: 4199b0e417efc0e469172c147aa766a81b3f073158eefc13315ab50e4240a4e2f41611e3c87939f4d3012357edf339b1450e49f2bc324f37f92040396342d476
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+ ## [[0.12.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.3...v0.12.4)] - 2024-02-03
+
+ - Bump bundled llama.cpp from b1971 to b2047.
+ - Add constant for file type: `LLAMA_FTYPE_MOSTLY_IQ3_XXS`.
+ - Add `supports_mmap?`, `supports_mlock?`, and `supports_gpu_offload?` module functions to `LLaMACpp`.
+ - Add `--with-vulkan` configuration option.
+ - Deprecate `mmap_supported?` and `mlock_supported?` module functions in `LLaMACpp`.
+ - Remove `LLAMA_MAX_DEVICES` constant.
+
  ## [[0.12.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.2...v0.12.3)] - 2024-01-27

  - Bump bundled llama.cpp from b1892 to b1971.
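For context on the API listed in the 0.12.4 entry above, here is a minimal usage sketch. It is not part of the packaged files and assumes the gem is installed and loaded with `require 'llama_cpp'`; the functions and the constant themselves are confirmed by the extension and signature hunks further down.

```ruby
require 'llama_cpp'

# Capability queries added in 0.12.4; they replace the deprecated
# mmap_supported? and mlock_supported? module functions.
puts LLaMACpp.supports_mmap?        # => true or false
puts LLaMACpp.supports_mlock?       # => true or false
puts LLaMACpp.supports_gpu_offload? # => true when the bundled llama.cpp was built with a GPU backend

# max_devices remains available; the LLAMA_MAX_DEVICES constant was removed.
puts LLaMACpp.max_devices

# New file-type constant registered in this release.
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ3_XXS
```

The new `--with-vulkan` configuration option maps to `LLAMA_VULKAN=1` in the bundled llama.cpp build, as the extconf.rb and Makefile hunks below show.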
@@ -19,6 +19,7 @@ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
  make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
  make_envs << ' LLAMA_MPI=1' if with_config('mpi')
+ make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')

  Dir.chdir(LLAMA_CPP_DIR) do
  _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
@@ -843,15 +843,15 @@ private:

  // tensor_split
  static VALUE _llama_model_params_get_tensor_split(VALUE self) {
- if (LLAMA_MAX_DEVICES < 1) {
+ if (llama_max_devices() < 1) {
  return rb_ary_new();
  }
- VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
+ VALUE ret = rb_ary_new2(llama_max_devices());
  LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
  if (ptr->params.tensor_split == nullptr) {
  return rb_ary_new();
  }
- for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
+ for (size_t i = 0; i < llama_max_devices(); i++) {
  rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
  }
  return ret;
@@ -3259,15 +3259,29 @@ static VALUE rb_llama_time_us(VALUE self) {
  }

  static VALUE rb_llama_mmap_supported(VALUE self) {
+ rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
  return llama_mmap_supported() ? Qtrue : Qfalse;
  }

  static VALUE rb_llama_mlock_supported(VALUE self) {
+ rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
  return llama_mlock_supported() ? Qtrue : Qfalse;
  }

  static VALUE rb_llama_max_devices(VALUE self) {
- return INT2NUM(llama_max_devices());
+ return SIZET2NUM(llama_max_devices());
+ }
+
+ static VALUE rb_llama_supports_mmap(VALUE self) {
+ return llama_supports_mmap() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_supports_mlock(VALUE self) {
+ return llama_supports_mlock() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_supports_gpu_offload(VALUE self) {
+ return llama_supports_gpu_offload() ? Qtrue : Qfalse;
  }

  extern "C" void Init_llama_cpp(void) {
@@ -3294,8 +3308,9 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
-
- rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
+ rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
+ rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
+ rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);

  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
@@ -3329,6 +3344,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.12.3'
+ VERSION = '0.12.4'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1971'
+ LLAMA_CPP_VERSION = 'b2047'
  end
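The extension hunks above show that the deprecated query names still work in 0.12.4 but now go through `rb_warn`. A rough migration sketch (again assuming `require 'llama_cpp'`; the exact warning text comes from the `rb_warn` calls shown earlier):

```ruby
require 'llama_cpp'

# Deprecated since 0.12.4: each call emits a warning on $stderr, e.g.
#   warning: mmap_supported? is deprecated. Use supports_mmap? instead.
LLaMACpp.mmap_supported?
LLaMACpp.mlock_supported?

# Preferred replacements; they query the same llama.cpp capabilities.
LLaMACpp.supports_mmap?
LLaMACpp.supports_mlock?
```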
data/sig/llama_cpp.rbs CHANGED
@@ -3,8 +3,6 @@ module LLaMACpp
  LLAMA_CPP_VERSION: String
  LLAMA_DEFALUT_SEED: String

- LLAMA_MAX_DEVICES: Integer
-
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -26,6 +24,7 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
  LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer

  LLAMA_KV_OVERRIDE_INT: Integer
  LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -61,6 +60,9 @@ module LLaMACpp
  def self?.mmap_supported?: () -> bool
  def self?.mlock_supported?: () -> bool
  def self?.max_devices: () -> Integer
+ def self?.supports_mmap?: () -> bool
+ def self?.supports_mlock?: () -> bool
+ def self?.supports_gpu_offload?: () -> bool

  class TokenData
  public
@@ -9,7 +9,7 @@ TEST_TARGETS = \
  tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
  tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
  tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops tests/test-autorelease
+ tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease

  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -450,6 +450,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
  endif # LLAMA_CLBLAST

+ ifdef LLAMA_VULKAN
+ MK_CPPFLAGS += -DGGML_USE_VULKAN
+ MK_LDFLAGS += -lvulkan
+ OBJS += ggml-vulkan.o
+
+ ifdef LLAMA_VULKAN_CHECK_RESULTS
+ MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+ endif
+
+ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+ endif # LLAMA_VULKAN
+
  ifdef LLAMA_HIPBLAS

  ifeq ($(wildcard /opt/rocm),)
@@ -575,12 +588,15 @@ train.o: common/train.cpp common/train.h
  libllama.so: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

+ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+ ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+
  lib: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.dll *.dylib *.a benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)

  #
  # Examples
@@ -625,7 +641,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
  save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@@ -754,5 +770,8 @@ tests/test-c.o: tests/test-c.c llama.h
  tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

- tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+ tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
  }

  size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
- return alloc->max_size;
+ // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+ // to avoid this, we add a 10% margin to the buffer size
+ return alloc->max_size + alloc->max_size/10;
  }

  // graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
  }

  // utils
- ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
- GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
- size_t alignment = ggml_backend_buft_get_alignment(buft);
-
- size_t nbytes = 0;
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
- if (t->data == NULL && t->view_src == NULL) {
- nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
- }
- }
-
- if (nbytes == 0) {
- // all the tensors in the context are already allocated
- #ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
- #endif
- return NULL;
- }

- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+ static bool alloc_tensor_range(struct ggml_context * ctx,
+ struct ggml_tensor * first, struct ggml_tensor * last,
+ ggml_backend_buffer_type_t buft, size_t size,
+ ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
  if (buffer == NULL) {
- // failed to allocate buffer
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
  #endif
- return NULL;
+ for (size_t i = 0; i < *n_buffers; i++) {
+ ggml_backend_buffer_free(*buffers[i]);
+ }
+ free(*buffers);
+ return false;
  }

  ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);

- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
  if (t->data == NULL) {
  if (t->view_src == NULL) {
  ggml_tallocr_alloc(tallocr, t);
@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

  ggml_tallocr_free(tallocr);

+ *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+ (*buffers)[(*n_buffers)++] = buffer;
+
+ return true;
+ }
+
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+ GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+ size_t alignment = ggml_backend_buft_get_alignment(buft);
+ size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+ ggml_backend_buffer_t * buffers = NULL;
+ size_t n_buffers = 0;
+
+ size_t cur_buf_size = 0;
+ struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+ for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ size_t this_size = 0;
+ if (t->data == NULL && t->view_src == NULL) {
+ this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+ }
+
+ if (this_size > max_size) {
+ // tensor is too large to fit in a single buffer
+ fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+ __func__, t->name,
+ ggml_backend_buft_name(buft),
+ this_size, max_size);
+ for (size_t i = 0; i < n_buffers; i++) {
+ ggml_backend_buffer_free(buffers[i]);
+ }
+ free(buffers);
+ return NULL;
+ }
+
+ if ((cur_buf_size + this_size) > max_size) {
+ // allocate tensors in the current buffer
+ if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+ return NULL;
+ }
+ first = t;
+ cur_buf_size = this_size;
+ } else {
+ cur_buf_size += this_size;
+ }
+ }
+
+ // allocate remaining tensors
+ if (cur_buf_size > 0) {
+ if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+ return NULL;
+ }
+ }
+
+ if (n_buffers == 0) {
+ // all the tensors in the context are already allocated
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+ #endif
+ return NULL;
+ }
+
+ ggml_backend_buffer_t buffer;
+ if (n_buffers == 1) {
+ buffer = buffers[0];
+ } else {
+ buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+ }
+ free(buffers);
  return buffer;
  }
@@ -19,6 +19,7 @@ extern "C" {
  const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
  ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
  size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+ size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
  size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
  bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
  // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
  // do not use directly, use ggml_backend_tensor_copy instead
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

+ // buffer that contains a collection of buffers
+ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+ GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
  //
  // Backend
  //
@@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
  return buft->iface.get_alignment(buft);
  }

+ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+ // get_max_size is optional, defaults to SIZE_MAX
+ if (buft->iface.get_max_size) {
+ return buft->iface.get_max_size(buft);
+ }
+ return SIZE_MAX;
+ }
+
  GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
  // get_alloc_size is optional, defaults to ggml_nbytes
  if (buft->iface.get_alloc_size) {
- return buft->iface.get_alloc_size(buft, tensor);
+ size_t size = buft->iface.get_alloc_size(buft, tensor);
+ assert(size >= ggml_nbytes(tensor));
+ return size;
  }
  return ggml_nbytes(tensor);
  }
@@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
  size_t size) {
  ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));

- GGML_ASSERT(iface.get_base != NULL);
-
  (*buffer) = (struct ggml_backend_buffer) {
  /* .interface = */ iface,
  /* .buft = */ buft,
@@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
  return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
  }

+ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+ return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+ }
+
  size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
  }
@@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {

  void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
  buffer->usage = usage;
+
+ // FIXME: add a generic callback to the buffer interface
+ if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+ ggml_backend_multi_buffer_set_usage(buffer, usage);
+ }
  }

  ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
  return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
  }

+ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+ return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+ }
+
  void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@@ -337,11 +358,26 @@ GGML_CALL static void ggml_backend_registry_init(void) {
  ggml_backend_cuda_reg_devices();
  #endif

+ #ifdef GGML_USE_SYCL
+ extern void ggml_backend_sycl_reg_devices(void);
+ ggml_backend_sycl_reg_devices();
+ #endif
+
  #ifdef GGML_USE_METAL
  extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
  extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
  ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
  #endif
+
+ #ifdef GGML_USE_VULKAN
+ extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+ ggml_backend_vk_reg_devices();
+ #endif
+
+ #ifdef GGML_USE_KOMPUTE
+ extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
+ ggml_backend_kompute_reg_devices();
+ #endif
  }

  GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@@ -545,6 +581,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
  /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
  /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
  /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
  /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
  /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -756,6 +794,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
  GGML_UNUSED(user_data);
  }

+ // multi-buffer buffer
+
+ struct ggml_backend_multi_buffer_context {
+ ggml_backend_buffer_t * buffers;
+ size_t n_buffers;
+ };
+
+ typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+ GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+ return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+ }
+
+ GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ ggml_backend_buffer_free(ctx->buffers[i]);
+ }
+
+ free(ctx->buffers);
+ free(ctx);
+ }
+
+ GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ ggml_backend_buffer_clear(ctx->buffers[i], value);
+ }
+ }
+
+ static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+ static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+ /* .get_name = */ ggml_backend_multi_buffer_get_name,
+ /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
+ /* .get_base = */ NULL,
+ /* .init_tensor = */ NULL,
+ /* .set_tensor = */ NULL,
+ /* .get_tensor = */ NULL,
+ /* .cpy_tensor = */ NULL,
+ /* .clear = */ ggml_backend_multi_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ return multi_backend_buffer_i;
+ }
+
+ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+ ctx->n_buffers = n_buffers;
+ ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+ size_t total_size = 0;
+ for (size_t i = 0; i < n_buffers; i++) {
+ ctx->buffers[i] = buffers[i];
+ total_size += ggml_backend_buffer_get_size(buffers[i]);
+ }
+
+ return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+ }
+
+ GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+ return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+ }
+
+ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+ GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+ }
+ }
+

  // scheduler

@@ -20,6 +20,7 @@ extern "C" {
  GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
  GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
  GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+ GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
  GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
  GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
  GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
@@ -36,6 +37,7 @@ extern "C" {
  GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
  GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
  GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
  GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
  GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
  GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
@@ -54,6 +56,7 @@ extern "C" {
  GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
  GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
  GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+ GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);

  GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
  GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);