llama_cpp 0.12.2 → 0.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: a65b477c93060832783d03b065dd336820bf22e985dd7b9f53a20e5834f29a0d
-   data.tar.gz: 3ab3f5147bb207ddeea4b902e86de41398fbe497bb521ab00a4fe89ccd790d50
+   metadata.gz: e77376858bfb07c67b29963a898f3cf9f2494a5cadabbc4cf777e87af801b33c
+   data.tar.gz: 1196c932182a2c76416c326dac934e97cb9111e6bed269c4776e05587391b916
  SHA512:
-   metadata.gz: 119a77a344ece09afda87d89321f679b9c53975c6b340150e298fa3869a0bf48849fafd49e5ef18b001311aae10e3fa9aba29c96de2c4aa8535cdad7d01382cb
-   data.tar.gz: 444fc224413ee6fc94b0866da07460e9c95162941fcd80c831c6f7a950373503eba74b10d437724db2c9debec4719c5a9b25875f1b0a014c956bcb424ca8bf47
+   metadata.gz: 594f4af7e1e88f156926b7605683e29b47a7caf3afb2c18434fa0035415902fb51a9dafe845a4a108bce0dfdd9ad63b5301790826ee6995fa1799cf2bff0c1ee
+   data.tar.gz: 4199b0e417efc0e469172c147aa766a81b3f073158eefc13315ab50e4240a4e2f41611e3c87939f4d3012357edf339b1450e49f2bc324f37f92040396342d476
data/CHANGELOG.md CHANGED
@@ -1,3 +1,18 @@
+ ## [[0.12.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.3...v0.12.4)] - 2024-02-03
+
+ - Bump bundled llama.cpp from b1971 to b2047.
+ - Add constant for file type: `LLAMA_FTYPE_MOSTLY_IQ3_XXS`.
+ - Add `supports_mmap?`, `supports_mlock?`, and `supports_gpu_offload?` module functions to `LLaMACpp`.
+ - Add `--with-vulkan` configuration option.
+ - Deprecate `mmap_supported?` and `mlock_supported?` module functions in `LLaMACpp`.
+ - Remove `LLAMA_MAX_DEVICES` constant.
+
+ ## [[0.12.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.2...v0.12.3)] - 2024-01-27
+
+ - Bump bundled llama.cpp from b1892 to b1971.
+ - Add constant for file type: `LLAMA_FTYPE_MOSTLY_Q3_K_XS`.
+ - Add `sample_entropy` method to Context.
+
  ## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
 
  - Bump bundled llama.cpp from b1833 to b1892.
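The 0.12.4 and 0.12.3 entries above mostly add query functions and a new sampling API on top of the bumped llama.cpp build. A minimal sketch of the new module-level capability checks (assuming the gem is installed; the results depend on how the bundled llama.cpp was compiled):

```ruby
require 'llama_cpp'

# New in 0.12.4: ask the bundled llama.cpp build what it supports.
puts "mmap:        #{LLaMACpp.supports_mmap?}"
puts "mlock:       #{LLaMACpp.supports_mlock?}"
puts "GPU offload: #{LLaMACpp.supports_gpu_offload?}"
puts "max devices: #{LLaMACpp.max_devices}" # replaces the removed LLAMA_MAX_DEVICES constant

# The new quantization file-type constants are plain integers on the module.
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_Q3_K_XS  # added in 0.12.3
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ3_XXS  # added in 0.12.4
```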
data/README.md CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
  # use OpenBLAS
  $ gem install llama_cpp -- --with-openblas
 
- # use CUDA
- $ gem install llama_cpp -- --with-cuda
+ # use cuBLAS
+ $ gem install llama_cpp -- --with-cublas
  ```
 
  Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/ext/llama_cpp/extconf.rb CHANGED
@@ -19,6 +19,7 @@ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
  make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
  make_envs << ' LLAMA_MPI=1' if with_config('mpi')
+ make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
 
  Dir.chdir(LLAMA_CPP_DIR) do
    _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
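This hunk is the hook behind the `--with-vulkan` changelog entry: each `--with-*` option passed after `gem install llama_cpp --` is read via mkmf's `with_config` and appended to the environment of the `make lib` call shown above. A self-contained sketch of that mapping (not the real extconf.rb; the install command follows the pattern the README already uses):

```ruby
# Assumed invocation, mirroring the README examples above:
#   gem install llama_cpp -- --with-vulkan
#
# Toy reimplementation of the option -> make-variable mapping from the hunk;
# the real extconf.rb uses mkmf's with_config rather than scanning ARGV.
BACKEND_FLAGS = {
  'cublas'  => ' LLAMA_CUBLAS=1',
  'clblast' => ' LLAMA_CLBLAST=1',
  'hipblas' => ' LLAMA_HIPBLAS=1',
  'mpi'     => ' LLAMA_MPI=1',
  'vulkan'  => ' LLAMA_VULKAN=1' # new in 0.12.4
}.freeze

make_envs = BACKEND_FLAGS.select { |opt, _| ARGV.include?("--with-#{opt}") }.values.join
puts "make lib#{make_envs}".strip
```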
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -843,15 +843,15 @@ private:
 
    // tensor_split
    static VALUE _llama_model_params_get_tensor_split(VALUE self) {
-     if (LLAMA_MAX_DEVICES < 1) {
+     if (llama_max_devices() < 1) {
        return rb_ary_new();
      }
-     VALUE ret = rb_ary_new2(LLAMA_MAX_DEVICES);
+     VALUE ret = rb_ary_new2(llama_max_devices());
      LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
      if (ptr->params.tensor_split == nullptr) {
        return rb_ary_new();
      }
-     for (size_t i = 0; i < LLAMA_MAX_DEVICES; i++) {
+     for (size_t i = 0; i < llama_max_devices(); i++) {
        rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
      }
      return ret;
@@ -2054,6 +2054,7 @@ public:
    rb_define_method(rb_cLLaMAContext, "sample_tail_free", RUBY_METHOD_FUNC(_llama_context_sample_tail_free), -1);
    rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
    rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
+   rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
    rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
    rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
    rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
@@ -2904,6 +2905,50 @@ private:
      return Qnil;
    }
 
+   static VALUE _llama_context_sample_entropy(int argc, VALUE* argv, VALUE self) {
+     VALUE kw_args = Qnil;
+     ID kw_table[3] = { rb_intern("min_temp"), rb_intern("max_temp"), rb_intern("exponent_val") };
+     VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+     VALUE candidates = Qnil;
+     rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
+     rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
+
+     if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
+       rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
+       return Qnil;
+     }
+     if (!RB_FLOAT_TYPE_P(kw_values[0])) {
+       rb_raise(rb_eArgError, "min_temp must be a float");
+       return Qnil;
+     }
+     if (!RB_FLOAT_TYPE_P(kw_values[1])) {
+       rb_raise(rb_eArgError, "max_temp must be a float");
+       return Qnil;
+     }
+     if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+       rb_raise(rb_eArgError, "exponent_val must be a float");
+       return Qnil;
+     }
+
+     LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+     if (ctx_ptr->ctx == NULL) {
+       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+       return Qnil;
+     }
+     LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
+     if (cnd_ptr->array.data == nullptr) {
+       rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
+       return Qnil;
+     }
+     const float min_temp = NUM2DBL(kw_values[0]);
+     const float max_temp = NUM2DBL(kw_values[1]);
+     const float exponent_val = NUM2DBL(kw_values[2]);
+
+     llama_sample_entropy(ctx_ptr->ctx, &(cnd_ptr->array), min_temp, max_temp, exponent_val);
+
+     return Qnil;
+   }
+
    static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
      VALUE kw_args = Qnil;
      ID kw_table[1] = { rb_intern("temperature") };
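On the Ruby side, the new binding above is called with keyword arguments matching the signature added to `sig/llama_cpp.rbs` further down (`sample_entropy(TokenDataArray, min_temp:, max_temp:, exponent_val:)`). A rough usage sketch; the model/context setup and the way candidates are built from the logits follow the gem's usual sampling flow and should be read as assumptions, not an exact recipe:

```ruby
require 'llama_cpp'

# Assumption: a local GGUF model; the setup mirrors the gem's other sampling examples.
model   = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# ... decode a prompt here, then build a TokenDataArray from the resulting logits ...
logits     = context.logits
candidates = LLaMACpp::TokenDataArray.new(
  Array.new(model.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
)

# New in 0.12.3: entropy-based (dynamic temperature) sampling, wrapping llama_sample_entropy.
context.sample_entropy(candidates, min_temp: 0.1, max_temp: 2.0, exponent_val: 1.0)
```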
@@ -3214,15 +3259,29 @@ static VALUE rb_llama_time_us(VALUE self) {
  }
 
  static VALUE rb_llama_mmap_supported(VALUE self) {
+   rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
    return llama_mmap_supported() ? Qtrue : Qfalse;
  }
 
  static VALUE rb_llama_mlock_supported(VALUE self) {
+   rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
    return llama_mlock_supported() ? Qtrue : Qfalse;
  }
 
  static VALUE rb_llama_max_devices(VALUE self) {
-   return INT2NUM(llama_max_devices());
+   return SIZET2NUM(llama_max_devices());
+ }
+
+ static VALUE rb_llama_supports_mmap(VALUE self) {
+   return llama_supports_mmap() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_supports_mlock(VALUE self) {
+   return llama_supports_mlock() ? Qtrue : Qfalse;
+ }
+
+ static VALUE rb_llama_supports_gpu_offload(VALUE self) {
+   return llama_supports_gpu_offload() ? Qtrue : Qfalse;
  }
 
  extern "C" void Init_llama_cpp(void) {
@@ -3249,8 +3308,9 @@ extern "C" void Init_llama_cpp(void) {
    rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
    rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
    rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
-
-   rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
+   rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
+   rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
+   rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
    rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
    rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
@@ -3283,6 +3343,8 @@ extern "C" void Init_llama_cpp(void) {
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
 
    rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.12.2'
+   VERSION = '0.12.4'
 
    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'b1892'
+   LLAMA_CPP_VERSION = 'b2047'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -3,8 +3,6 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
-  LLAMA_MAX_DEVICES: Integer
-
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -25,6 +23,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -60,6 +60,9 @@ module LLaMACpp
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
+  def self?.supports_mmap?: () -> bool
+  def self?.supports_mlock?: () -> bool
+  def self?.supports_gpu_offload?: () -> bool
 
   class TokenData
     public
@@ -216,6 +219,7 @@ module LLaMACpp
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
+    def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
     def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
     def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
@@ -9,7 +9,7 @@ TEST_TARGETS = \
  tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
  tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
  tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops
+ tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
 
  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -450,6 +450,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
  	$(CXX) $(CXXFLAGS) -c $< -o $@
  endif # LLAMA_CLBLAST
 
+ ifdef LLAMA_VULKAN
+ 	MK_CPPFLAGS += -DGGML_USE_VULKAN
+ 	MK_LDFLAGS += -lvulkan
+ 	OBJS += ggml-vulkan.o
+
+ ifdef LLAMA_VULKAN_CHECK_RESULTS
+ 	MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+ endif
+
+ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+ 	$(CXX) $(CXXFLAGS) -c $< -o $@
+ endif # LLAMA_VULKAN
+
  ifdef LLAMA_HIPBLAS
 
  ifeq ($(wildcard /opt/rocm),)
@@ -575,12 +588,15 @@ train.o: common/train.cpp common/train.h
  libllama.so: llama.o ggml.o $(OBJS)
  	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
+ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+ 	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+
  lib: llama.o ggml.o $(OBJS)
  	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
  	ar rcs libllama.a $^
 
  clean:
- 	rm -vrf *.o tests/*.o *.so *.dll *.dylib *.a benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ 	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 
  #
  # Examples
@@ -625,7 +641,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
  save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
- server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@@ -753,3 +769,9 @@ tests/test-c.o: tests/test-c.c llama.h
 
  tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
  	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+ 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+ tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+ 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
      if (block->size >= size) {
          best_fit_block = alloc->n_free_blocks - 1;
      } else {
-         fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                 __func__, size, max_avail);
+         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
+                 __func__, tensor->name, size, max_avail);
          GGML_ASSERT(!"not enough space in the buffer");
          return;
      }
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
  }
 
  size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-     return alloc->max_size;
+     // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+     // to avoid this, we add a 10% margin to the buffer size
+     return alloc->max_size + alloc->max_size/10;
  }
 
  // graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
  }
 
  // utils
- ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-     size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-     size_t nbytes = 0;
-     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-         if (t->data == NULL && t->view_src == NULL) {
-             nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-         }
-     }
-
-     if (nbytes == 0) {
-         // all the tensors in the context are already allocated
- #ifndef NDEBUG
-         fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
- #endif
-         return NULL;
-     }
 
-     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+ static bool alloc_tensor_range(struct ggml_context * ctx,
+         struct ggml_tensor * first, struct ggml_tensor * last,
+         ggml_backend_buffer_type_t buft, size_t size,
+         ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
      if (buffer == NULL) {
-         // failed to allocate buffer
  #ifndef NDEBUG
-         fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+         fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
  #endif
-         return NULL;
+         for (size_t i = 0; i < *n_buffers; i++) {
+             ggml_backend_buffer_free(*buffers[i]);
+         }
+         free(*buffers);
+         return false;
      }
 
      ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
-     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
          if (t->data == NULL) {
              if (t->view_src == NULL) {
                  ggml_tallocr_alloc(tallocr, t);
824
814
 
825
815
  ggml_tallocr_free(tallocr);
826
816
 
817
+ *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
818
+ (*buffers)[(*n_buffers)++] = buffer;
819
+
820
+ return true;
821
+ }
822
+
823
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
824
+ GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
825
+
826
+ size_t alignment = ggml_backend_buft_get_alignment(buft);
827
+ size_t max_size = ggml_backend_buft_get_max_size(buft);
828
+
829
+ ggml_backend_buffer_t * buffers = NULL;
830
+ size_t n_buffers = 0;
831
+
832
+ size_t cur_buf_size = 0;
833
+ struct ggml_tensor * first = ggml_get_first_tensor(ctx);
834
+ for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
835
+ size_t this_size = 0;
836
+ if (t->data == NULL && t->view_src == NULL) {
837
+ this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
838
+ }
839
+
840
+ if (this_size > max_size) {
841
+ // tensor is too large to fit in a single buffer
842
+ fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
843
+ __func__, t->name,
844
+ ggml_backend_buft_name(buft),
845
+ this_size, max_size);
846
+ for (size_t i = 0; i < n_buffers; i++) {
847
+ ggml_backend_buffer_free(buffers[i]);
848
+ }
849
+ free(buffers);
850
+ return NULL;
851
+ }
852
+
853
+ if ((cur_buf_size + this_size) > max_size) {
854
+ // allocate tensors in the current buffer
855
+ if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
856
+ return NULL;
857
+ }
858
+ first = t;
859
+ cur_buf_size = this_size;
860
+ } else {
861
+ cur_buf_size += this_size;
862
+ }
863
+ }
864
+
865
+ // allocate remaining tensors
866
+ if (cur_buf_size > 0) {
867
+ if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
868
+ return NULL;
869
+ }
870
+ }
871
+
872
+ if (n_buffers == 0) {
873
+ // all the tensors in the context are already allocated
874
+ #ifndef NDEBUG
875
+ fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
876
+ #endif
877
+ return NULL;
878
+ }
879
+
880
+ ggml_backend_buffer_t buffer;
881
+ if (n_buffers == 1) {
882
+ buffer = buffers[0];
883
+ } else {
884
+ buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
885
+ }
886
+ free(buffers);
827
887
  return buffer;
828
888
  }
829
889
 
@@ -19,6 +19,7 @@ extern "C" {
      const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
      ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
      size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+     size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
      size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
      bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
      // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
      // do not use directly, use ggml_backend_tensor_copy instead
      bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+     // buffer that contains a collection of buffers
+     GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+     GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+     GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
      //
      // Backend
      //