llama_cpp 0.14.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +3 -1
- data/ext/llama_cpp/llama_cpp.cpp +71 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +9 -0
- data/vendor/tmp/llama.cpp/Makefile +28 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +358 -135
- data/vendor/tmp/llama.cpp/ggml-backend.h +41 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +187 -1033
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +42 -20
- data/vendor/tmp/llama.cpp/ggml-metal.metal +44 -910
- data/vendor/tmp/llama.cpp/ggml-quants.c +457 -1074
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +388 -565
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +6 -39
- data/vendor/tmp/llama.cpp/ggml.c +509 -343
- data/vendor/tmp/llama.cpp/ggml.h +61 -47
- data/vendor/tmp/llama.cpp/llama.cpp +1446 -687
- data/vendor/tmp/llama.cpp/llama.h +25 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d720138462a39f3fd9853befa19e55543a794eb4d1c379f7d9db516a4569db68
+  data.tar.gz: 9779852b62cf57ab208275b746ca2492921cf1f92d4ebf6be26a668f1a7bbb66
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7fa80468abc917099b58009a7a821c704989b8086026e92f8e71a1310ea7ec0449276aeb4653bdb4ddf499183c785b0513ab75b1fa6a94659fe15be2cf05190c
+  data.tar.gz: '091155784ead62d3ef4ec68f3b4c9f6b1fc97ef87db45327266712912501746b08df983e0d0b81b518d229d4d31f1a0d77ad36f2d7156c26141b8116049d3206'
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
+## [[0.14.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.1...v0.14.2)] - 2024-03-16
+
+- Fix to use metal embed library on macOS.
+
+## [[0.14.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.0...v0.14.1)] - 2024-03-16
+
+- Bump llama.cpp from b2361 to b2435.
+- Add constants for vocaburary type: `LLAMA_VOCAB_TYPE_NONE`.
+- Add `n_ubatch` and `n_seq_max` accessors to `ContextParams`.
+- Add `n_ubatch`, `n_seq_max`, `set_causal_attn`, and `synchronize` methods to `Context`.
+
 ## [[0.14.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.13.0...v0.14.0)] - 2024-03-09
 
 **Breaking Changes**
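The new `ContextParams` accessors map straight onto the matching fields of llama.cpp's `llama_context_params`. A minimal usage sketch (the values are arbitrary placeholders, not recommendations):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_ubatch  = 256   # forwarded to llama_context_params.n_ubatch (physical micro-batch size)
params.n_seq_max = 4     # forwarded to llama_context_params.n_seq_max (max parallel sequences)
p params.n_ubatch        # => 256
p params.n_seq_max       # => 4
```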
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -21,6 +21,8 @@ make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
 make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
 
+make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)
+
 Dir.chdir(LLAMA_CPP_DIR) do
   _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
   abort('Failed to build llama.cpp.') unless mkstatus.success?
@@ -33,8 +35,8 @@ if RUBY_PLATFORM.match?(/darwin/)
   Dir.chdir(VENDOR_LIB_DIR) do
     _mkstdout, _mkstderr, mkstatus = Open3.capture3("install_name_tool -id #{VENDOR_LIB_DIR}/libllama.dylib libllama.dylib")
     abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
-    FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
   end
+  FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
 end
 
 abort('libstdc++ is not found.') unless have_library('stdc++')
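For readers unfamiliar with mkmf, the pattern above works as follows: each `with_config` check looks for a `--with-<name>` flag passed at install time (e.g. `gem install llama_cpp -- --with-vulkan`) and appends the matching llama.cpp make variable, while the new line unconditionally enables the embedded Metal library on macOS. A condensed sketch of that mapping (only the flags visible in this hunk; the `make_envs` initialization is assumed, and this is not the complete extconf.rb):

```ruby
require 'mkmf'

make_envs = +''  # assumed initialization; the real file builds this string the same way
make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')  # gem install llama_cpp -- --with-hipblas
make_envs << ' LLAMA_MPI=1'     if with_config('mpi')      # gem install llama_cpp -- --with-mpi
make_envs << ' LLAMA_VULKAN=1'  if with_config('vulkan')   # gem install llama_cpp -- --with-vulkan
make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)  # always on for macOS

# The collected variables are then handed to llama.cpp's Makefile, e.g.
#   make lib LLAMA_VULKAN=1 LLAMA_METAL_EMBED_LIBRARY=1
```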
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -946,6 +946,10 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_ubatch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ubatch), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_params_get_n_ubatch), 0);
+    rb_define_method(rb_cLLaMAContextParams, "n_seq_max=", RUBY_METHOD_FUNC(_llama_context_params_set_n_seq_max), 1);
+    rb_define_method(rb_cLLaMAContextParams, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_params_get_n_seq_max), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_threads=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_threads", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_threads_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads_batch), 1);
@@ -1031,6 +1035,30 @@ private:
     return INT2NUM(ptr->params.n_batch);
   }
 
+  // n_ubatch
+  static VALUE _llama_context_params_set_n_ubatch(VALUE self, VALUE n_ubatch) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_ubatch = NUM2INT(n_ubatch);
+    return INT2NUM(ptr->params.n_ubatch);
+  }
+
+  static VALUE _llama_context_params_get_n_ubatch(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_ubatch);
+  }
+
+  // n_seq_max
+  static VALUE _llama_context_params_set_n_seq_max(VALUE self, VALUE n_seq_max) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.n_seq_max = NUM2INT(n_seq_max);
+    return INT2NUM(ptr->params.n_seq_max);
+  }
+
+  static VALUE _llama_context_params_get_n_seq_max(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return INT2NUM(ptr->params.n_seq_max);
+  }
+
   // n_threads
   static VALUE _llama_context_params_set_n_threads(VALUE self, VALUE n_threads) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -2019,6 +2047,8 @@ public:
     rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
+    rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
+    rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2033,6 +2063,8 @@ public:
     rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
     rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
     rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
+    rb_define_method(rb_cLLaMAContext, "set_causal_attn", RUBY_METHOD_FUNC(_llama_context_set_causal_attn), 1);
+    rb_define_method(rb_cLLaMAContext, "synchronize", RUBY_METHOD_FUNC(_llama_context_synchronize), 0);
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
@@ -2250,6 +2282,24 @@ private:
     return UINT2NUM(llama_n_batch(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_ubatch(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_ubatch(ptr->ctx));
+  }
+
+  static VALUE _llama_context_n_seq_max(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_seq_max(ptr->ctx));
+  }
+
   static VALUE _llama_context_get_timings(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -2395,6 +2445,26 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_set_causal_attn(VALUE self, VALUE causal_attn) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_set_causal_attn(ptr->ctx, RTEST(causal_attn) ? true : false);
+    return Qnil;
+  }
+
+  static VALUE _llama_context_synchronize(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_synchronize(ptr->ctx);
+    return Qnil;
+  }
+
   static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[1] = { rb_intern("session_path") };
@@ -3204,6 +3274,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
   rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_NONE", INT2NUM(LLAMA_VOCAB_TYPE_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
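On the Ruby side, the bindings registered above behave roughly as sketched below. This is a hedged fragment, not a complete program: `context` is assumed to be a `LLaMACpp::Context` built the usual way from a model and the `ContextParams` shown earlier.

```ruby
# `context` is an assumed, already-constructed LLaMACpp::Context instance.
context.set_causal_attn(false)  # RTEST(...) maps any truthy/falsy Ruby value to C true/false; returns nil
context.synchronize             # calls llama_synchronize to wait for pending backend work; returns nil
p context.n_ubatch              # => Integer, from llama_n_ubatch
p context.n_seq_max             # => Integer, from llama_n_seq_max

# Like the existing Context methods, each of these raises
# RuntimeError ("LLaMA context is not initialized") if the wrapped
# llama_context pointer is NULL.
```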
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.0'
+  VERSION = '0.14.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2361'
+  LLAMA_CPP_VERSION = 'b2435'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,7 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
+  LLAMA_VOCAB_TYPE_NONE: Integer
   LLAMA_VOCAB_TYPE_SPM: Integer
   LLAMA_VOCAB_TYPE_BPE: Integer
   LLAMA_VOCAB_TYPE_WPM: Integer
@@ -207,6 +208,8 @@ module LLaMACpp
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_batch: () -> Integer
+    def n_ubatch: () -> Integer
+    def n_seq_max: () -> Integer
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
@@ -221,6 +224,8 @@ module LLaMACpp
     def kv_cache_defrag: () -> void
     def kv_cache_update: () -> void
     def set_rng_seed: (Integer) -> void
+    def set_causal_attn: (bool) -> void
+    def synchronize: () -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
@@ -250,6 +255,10 @@ module LLaMACpp
     def n_ctx=: (Integer) -> Integer
     def n_batch: () -> Integer
     def n_batch=: (Integer) -> Integer
+    def n_ubatch: () -> Integer
+    def n_ubatch=: (Integer) -> Integer
+    def n_seq_max: () -> Integer
+    def n_seq_max=: (Integer) -> Integer
     def n_threads: () -> Integer
     def n_threads=: (Integer) -> Integer
     def n_threads_batch: () -> Integer
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
 	MK_CPPFLAGS += -D_BSD_SOURCE
 endif
 
+ifdef LLAMA_SCHED_MAX_COPIES
+	MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+endif
+
 ifdef LLAMA_DEBUG
 	MK_CFLAGS += -O0 -g
 	MK_CXXFLAGS += -O0 -g
@@ -201,6 +205,10 @@ ifdef LLAMA_SERVER_VERBOSE
 	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
 
+ifdef LLAMA_SERVER_SSL
+	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+	MK_LDFLAGS += -lssl -lcrypto
+endif
 
 ifdef LLAMA_CODE_COVERAGE
 	MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
@@ -451,7 +459,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
 ifdef JETSON_EOL_MODULE_DETECT
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
@@ -551,15 +559,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef LLAMA_METAL_EMBED_LIBRARY
-ggml-metal-embed.o: ggml-metal.metal
+ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 	@echo "Embedding Metal library"
+	@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib"
-	@echo ".globl _ggml_metallib_start"
-	@echo "_ggml_metallib_start:"
-	@echo ".incbin \"
-	@echo ".globl _ggml_metallib_end"
-	@echo "_ggml_metallib_end:"
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
 	@$(AS) $(TEMP_ASSEMBLY) -o $@
 	@rm -f ${TEMP_ASSEMBLY}
 endif
@@ -628,12 +637,15 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-
+unicode.o: unicode.cpp unicode.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+
+llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
@@ -725,6 +737,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-// TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
 }
 
 // tallocr
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
-    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
-    if (talloc == NULL) {
-        return NULL;
-    }
 
+struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     void * base = ggml_backend_buffer_get_base(buffer);
     size_t align = ggml_backend_buffer_get_alignment(buffer);
 
     assert(align && !(align & (align - 1))); // power of 2
 
-
+    struct ggml_tallocr talloc = (struct ggml_tallocr) {
         /*.buffer = */ buffer,
         /*.base = */ base,
         /*.alignment = */ align,
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     return talloc;
 }
 
-void
-    free(talloc);
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size = GGML_PAD(size, talloc->alignment);
 
@@ -354,12 +338,16 @@ struct hash_node {
     bool allocated;
 };
 
-//
 struct tensor_alloc {
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
 
+struct leaf_alloc {
+    int buffer_id;
+    struct tensor_alloc leaf;
+};
+
 struct node_alloc {
     int buffer_id;
     struct tensor_alloc dst;
@@ -378,7 +366,7 @@ struct ggml_gallocr {
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
 
-    struct
+    struct leaf_alloc * leaf_allocs; // [n_leafs]
     int n_leafs;
 };
 
@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
     return node_buffer_ids ? node_buffer_ids[i] : 0;
 }
 
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
    memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
 
+    // allocate leafs
+    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+    }
+
     // count number of children and views
-    // allocate
+    // allocate other graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         }
     }
 
-    // allocate the remaining leafs that are unused on the graph
-    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
-        if (hn->n_children == 0) {
-            assert(!hn->allocated);
-            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
-            ggml_gallocr_allocate_node(galloc, leaf, 0);
-        }
-    }
-
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     size_t hash_size = graph->visited_hash_table.size;
 
     // initialize hash table
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
 
     // allocate in hash table
-    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
 
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(
+        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].
-        galloc->leaf_allocs[i].
+        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+        galloc->leaf_allocs[i].leaf.offset = hn->offset;
+        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
     }
 
     // reallocate buffers if needed
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
-        if
+        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 }
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL);
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
-    assert(
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
-    if (
-        if (
+    if (tensor->view_src != NULL) {
+        if (tensor->buffer == NULL) {
            assert(tensor_alloc->offset == SIZE_MAX);
-            if (
+            if (tensor->view_src->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
-            ggml_backend_view_init(galloc->buffers[buffer_id],
+            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
        }
    } else {
-        if (
+        if (tensor->data == NULL) {
            assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id],
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
            void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id],
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
        } else {
-            if (
+            if (tensor->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
 
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
-        // zero size buffers are not allocated
        if (galloc->buffers[i] != NULL) {
            ggml_backend_buffer_reset(galloc->buffers[i]);
        }
    }
 
     // allocate the graph tensors from the previous assignments
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+    }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
         }
         ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
     }
-    // leafs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
-    }
 
     return true;
 }
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         return false;
     }
 
-    struct ggml_tallocr
+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
 
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
-                ggml_tallocr_alloc(tallocr, t);
+                ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
                 ggml_backend_view_init(buffer, t);
             }
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         }
     }
 
-    ggml_tallocr_free(tallocr);
-
     *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
     (*buffers)[(*n_buffers)++] = buffer;
 
data/vendor/tmp/llama.cpp/ggml-alloc.h
CHANGED
@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;
 
 // Tensor allocator
-
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
 
-GGML_API
-GGML_API void
-GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
 
 // Graph allocator
 /*
@@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
 
 // automatic reallocation if the topology changes when using a single buffer
 // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -86,12 +86,12 @@ extern "C" {
         // (optional) asynchronous tensor data access
         void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
         // (optional) complete all pending operations
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
-        //
+        // compute graph with a plan (not used currently)
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
@@ -102,16 +102,27 @@ extern "C" {
 
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
+        void (*GGML_CALL event_free) (ggml_backend_event_t event);
+        void (*GGML_CALL event_record) (ggml_backend_event_t event);
+        void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+        void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };
 
     struct ggml_backend {
         ggml_guid_t guid;
 
         struct ggml_backend_i iface;
-
         ggml_backend_context_t context;
     };
 
+    struct ggml_backend_event {
+        ggml_backend_t backend;
+        void * context;
+    };
+
     //
     // Backend registry
     //