llama_cpp 0.14.0 → 0.14.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: c7d855ccd32ae097f26a671751d6a2178361cf8d8a6c1b99af37859f2c47ca03
- data.tar.gz: 3b17318424d08c65ad34da3fa14956c86db0a2ea05ac174323a9b8d2b9e69d59
+ metadata.gz: c2a192fa17c1d313a93306e415ec27dfb8fb6ce993b9fc78797ed6e1d38ca63f
+ data.tar.gz: f800e54961a8bea5de95373d15f0cda30f7e95edd655cc0504247dfefcff473a
  SHA512:
- metadata.gz: 2d90bf9fdd8dbaf5e67b7fb8797a9412168ae6ce5fcfc4c6aca34e194d5beb5204184b5bb36d65dc507a7a618ac9e938987e8d8bf5871e4eb6304b5e6de06020
- data.tar.gz: eab524367ace146eb6e20786bd530cead145e1651bcdb726afbb5364609d04b22ca8a515016bb0c2d154ea97fb62f19222c122bc9bb5efe7fc389a6f259da6f0
+ metadata.gz: 48cefba1491319f82d52a46e8be34b5f0115dbe80bd6a9fdbf4fe0e190581a6b1ff8c3e2b2dfdaefeaa0b7cb11c8b9f5a84bcb60354f64248abbee3d488378ee
+ data.tar.gz: 9c6d75d3818b61192bd5c93a8b091003e2342f28102de1fbc9a1a02955a7c89e2a144b82bbe83e805b3f741261e967469c3ad2f6d347b1b870fb51880b850d89
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
+ ## [[0.14.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.0...v0.14.1)] - 2024-03-16
+
+ - Bump llama.cpp from b2361 to b2435.
+ - Add constant for vocabulary type: `LLAMA_VOCAB_TYPE_NONE`.
+ - Add `n_ubatch` and `n_seq_max` accessors to `ContextParams`.
+ - Add `n_ubatch`, `n_seq_max`, `set_causal_attn`, and `synchronize` methods to `Context`.
+
  ## [[0.14.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.13.0...v0.14.0)] - 2024-03-09

  **Breaking Changes**
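
The 0.14.1 entries above surface new llama.cpp context options through the Ruby bindings. A minimal sketch of how they fit together, following the gem's usual Model/Context setup; the model path and the parameter values are placeholders, not recommendations:

```ruby
require 'llama_cpp'

# Set the new micro-batch and sequence limits before creating the context.
context_params = LLaMACpp::ContextParams.new
context_params.n_ubatch = 256  # physical micro-batch size (new accessor in 0.14.1)
context_params.n_seq_max = 4   # maximum number of sequences (new accessor in 0.14.1)

model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)
context = LLaMACpp::Context.new(model: model, params: context_params)

# The context exposes the effective values it was created with.
puts context.n_ubatch
puts context.n_seq_max

# Toggle causal attention and wait for any pending backend work to finish.
context.set_causal_attn(true)
context.synchronize
```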
@@ -946,6 +946,10 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
  rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
  rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+ rb_define_method(rb_cLLaMAContextParams, "n_ubatch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ubatch), 1);
+ rb_define_method(rb_cLLaMAContextParams, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_params_get_n_ubatch), 0);
+ rb_define_method(rb_cLLaMAContextParams, "n_seq_max=", RUBY_METHOD_FUNC(_llama_context_params_set_n_seq_max), 1);
+ rb_define_method(rb_cLLaMAContextParams, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_params_get_n_seq_max), 0);
  rb_define_method(rb_cLLaMAContextParams, "n_threads=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads), 1);
  rb_define_method(rb_cLLaMAContextParams, "n_threads", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads), 0);
  rb_define_method(rb_cLLaMAContextParams, "n_threads_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads_batch), 1);
@@ -1031,6 +1035,30 @@ private:
  return INT2NUM(ptr->params.n_batch);
  }

+ // n_ubatch
+ static VALUE _llama_context_params_set_n_ubatch(VALUE self, VALUE n_ubatch) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.n_ubatch = NUM2INT(n_ubatch);
+ return INT2NUM(ptr->params.n_ubatch);
+ }
+
+ static VALUE _llama_context_params_get_n_ubatch(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return INT2NUM(ptr->params.n_ubatch);
+ }
+
+ // n_seq_max
+ static VALUE _llama_context_params_set_n_seq_max(VALUE self, VALUE n_seq_max) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.n_seq_max = NUM2INT(n_seq_max);
+ return INT2NUM(ptr->params.n_seq_max);
+ }
+
+ static VALUE _llama_context_params_get_n_seq_max(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return INT2NUM(ptr->params.n_seq_max);
+ }
+
  // n_threads
  static VALUE _llama_context_params_set_n_threads(VALUE self, VALUE n_threads) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -2019,6 +2047,8 @@ public:
  rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
+ rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
+ rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2033,6 +2063,8 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
  rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
+ rb_define_method(rb_cLLaMAContext, "set_causal_attn", RUBY_METHOD_FUNC(_llama_context_set_causal_attn), 1);
+ rb_define_method(rb_cLLaMAContext, "synchronize", RUBY_METHOD_FUNC(_llama_context_synchronize), 0);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
@@ -2250,6 +2282,24 @@ private:
  return UINT2NUM(llama_n_batch(ptr->ctx));
  }

+ static VALUE _llama_context_n_ubatch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_ubatch(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_seq_max(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_seq_max(ptr->ctx));
+ }
+
  static VALUE _llama_context_get_timings(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2395,6 +2445,26 @@ private:
  return Qnil;
  }

+ static VALUE _llama_context_set_causal_attn(VALUE self, VALUE causal_attn) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_set_causal_attn(ptr->ctx, RTEST(causal_attn) ? true : false);
+ return Qnil;
+ }
+
+ static VALUE _llama_context_synchronize(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_synchronize(ptr->ctx);
+ return Qnil;
+ }
+
  static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[1] = { rb_intern("session_path") };
@@ -3204,6 +3274,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);

+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_NONE", INT2NUM(LLAMA_VOCAB_TYPE_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
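
With the constant registered above, all four vocabulary types are now addressable from Ruby. A small sketch; `vocab_type_label` is a hypothetical helper for illustration, not part of the gem:

```ruby
require 'llama_cpp'

# Map a llama.cpp vocabulary-type code to a readable label.
# LLAMA_VOCAB_TYPE_NONE (new in 0.14.1) marks models that ship without a vocabulary.
def vocab_type_label(type)
  case type
  when LLaMACpp::LLAMA_VOCAB_TYPE_NONE then 'none (no vocabulary)'
  when LLaMACpp::LLAMA_VOCAB_TYPE_SPM  then 'SentencePiece (SPM)'
  when LLaMACpp::LLAMA_VOCAB_TYPE_BPE  then 'byte-pair encoding (BPE)'
  when LLaMACpp::LLAMA_VOCAB_TYPE_WPM  then 'WordPiece (WPM)'
  else 'unknown'
  end
end
```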
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.0'
+ VERSION = '0.14.1'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2361'
+ LLAMA_CPP_VERSION = 'b2435'
  end
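
Both constants in the hunk above are plain Ruby values, so the bump can be checked at runtime; a trivial, illustrative snippet:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION            # => "0.14.1"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2435", the bundled llama.cpp build
```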
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,7 @@ module LLaMACpp
  LLAMA_CPP_VERSION: String
  LLAMA_DEFALUT_SEED: String

+ LLAMA_VOCAB_TYPE_NONE: Integer
  LLAMA_VOCAB_TYPE_SPM: Integer
  LLAMA_VOCAB_TYPE_BPE: Integer
  LLAMA_VOCAB_TYPE_WPM: Integer
@@ -207,6 +208,8 @@ module LLaMACpp
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
  def n_batch: () -> Integer
+ def n_ubatch: () -> Integer
+ def n_seq_max: () -> Integer
  def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
@@ -221,6 +224,8 @@ module LLaMACpp
  def kv_cache_defrag: () -> void
  def kv_cache_update: () -> void
  def set_rng_seed: (Integer) -> void
+ def set_causal_attn: (bool) -> void
+ def synchronize: () -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
@@ -250,6 +255,10 @@ module LLaMACpp
  def n_ctx=: (Integer) -> Integer
  def n_batch: () -> Integer
  def n_batch=: (Integer) -> Integer
+ def n_ubatch: () -> Integer
+ def n_ubatch=: (Integer) -> Integer
+ def n_seq_max: () -> Integer
+ def n_seq_max=: (Integer) -> Integer
  def n_threads: () -> Integer
  def n_threads=: (Integer) -> Integer
  def n_threads_batch: () -> Integer
@@ -2,7 +2,7 @@
  BUILD_TARGETS = \
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
  simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+ speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

  # Binaries only useful for tests
  TEST_TARGETS = \
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
  MK_CPPFLAGS += -D_BSD_SOURCE
  endif

+ ifdef LLAMA_SCHED_MAX_COPIES
+ MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+ endif
+
  ifdef LLAMA_DEBUG
  MK_CFLAGS += -O0 -g
  MK_CXXFLAGS += -O0 -g
@@ -201,6 +205,10 @@ ifdef LLAMA_SERVER_VERBOSE
  MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
  endif

+ ifdef LLAMA_SERVER_SSL
+ MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+ MK_LDFLAGS += -lssl -lcrypto
+ endif

  ifdef LLAMA_CODE_COVERAGE
  MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
@@ -451,7 +459,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
  ifdef LLAMA_CUDA_CCBIN
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
  endif
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
  ifdef JETSON_EOL_MODULE_DETECT
  $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
  else
@@ -551,15 +559,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
  $(CC) $(CFLAGS) -c $< -o $@

  ifdef LLAMA_METAL_EMBED_LIBRARY
- ggml-metal-embed.o: ggml-metal.metal
+ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
  @echo "Embedding Metal library"
+ @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
  $(eval TEMP_ASSEMBLY=$(shell mktemp))
- @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
- @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
- @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
- @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
- @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
- @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+ @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+ @echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
  @$(AS) $(TEMP_ASSEMBLY) -o $@
  @rm -f ${TEMP_ASSEMBLY}
  endif
@@ -628,12 +637,15 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
  ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
  $(CC) $(CFLAGS) -c $< -o $@

- ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
  $(CC) $(CFLAGS) -c $< -o $@

- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
+ unicode.o: unicode.cpp unicode.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@

- llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+
+ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

  COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
@@ -725,6 +737,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  }
  }

- // TODO: GGML_PAD ?
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
  assert(alignment && !(alignment & (alignment - 1))); // power of 2
  size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
  }

  // tallocr
- struct ggml_tallocr {
- ggml_backend_buffer_t buffer;
- void * base;
- size_t alignment;
- size_t offset;
- };
-
- ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
- ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
- if (talloc == NULL) {
- return NULL;
- }

+ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
  void * base = ggml_backend_buffer_get_base(buffer);
  size_t align = ggml_backend_buffer_get_alignment(buffer);

  assert(align && !(align & (align - 1))); // power of 2

- *talloc = (struct ggml_tallocr) {
+ struct ggml_tallocr talloc = (struct ggml_tallocr) {
  /*.buffer = */ buffer,
  /*.base = */ base,
  /*.alignment = */ align,
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
  return talloc;
  }

- void ggml_tallocr_free(ggml_tallocr_t talloc) {
- free(talloc);
- }
-
- void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
  size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
  size = GGML_PAD(size, talloc->alignment);

@@ -354,12 +338,16 @@ struct hash_node {
  bool allocated;
  };

- //
  struct tensor_alloc {
  size_t offset;
  size_t size_max; // 0 = pre-allocated, unused, or view
  };

+ struct leaf_alloc {
+ int buffer_id;
+ struct tensor_alloc leaf;
+ };
+
  struct node_alloc {
  int buffer_id;
  struct tensor_alloc dst;
@@ -378,7 +366,7 @@ struct ggml_gallocr {
  struct node_alloc * node_allocs; // [n_nodes]
  int n_nodes;

- struct tensor_alloc * leaf_allocs; // [n_leafs]
+ struct leaf_alloc * leaf_allocs; // [n_leafs]
  int n_leafs;
  };

@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
  return node_buffer_ids ? node_buffer_ids[i] : 0;
  }

- static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
  // clear hash tables
  memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
  memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));

+ // allocate leafs
+ // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+ }
+
  // count number of children and views
- // allocate all graph inputs and leafs first to avoid overwriting them
+ // allocate other graph inputs and leafs first to avoid overwriting them
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];

@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  }
  }

- // allocate the remaining leafs that are unused on the graph
- // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
- for (int i = 0; i < graph->n_leafs; i++) {
- struct ggml_tensor * leaf = graph->leafs[i];
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
- if (hn->n_children == 0) {
- assert(!hn->allocated);
- // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
- ggml_gallocr_allocate_node(galloc, leaf, 0);
- }
- }
-
  // allocate tensors
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  }
  }

- bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
  size_t hash_size = graph->visited_hash_table.size;

  // initialize hash table
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }

  // allocate in hash table
- ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+ ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);

  // set the node_allocs from the hash table
  if (galloc->n_nodes < graph->n_nodes) {
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }
  if (galloc->n_leafs < graph->n_leafs) {
  free(galloc->leaf_allocs);
- galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+ galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
  GGML_ASSERT(galloc->leaf_allocs != NULL);
  }
  galloc->n_leafs = graph->n_leafs;
  for (int i = 0; i < graph->n_leafs; i++) {
  struct ggml_tensor * leaf = graph->leafs[i];
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
- galloc->leaf_allocs[i].offset = hn->offset;
- galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
  }

  // reallocate buffers if needed
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
  size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

- if (new_size > cur_size) {
+ // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+ if (new_size > cur_size || galloc->buffers[i] == NULL) {
  #ifndef NDEBUG
  fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }

  bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
- return ggml_gallocr_reserve_n(galloc, graph, NULL);
+ return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
  }

- static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
- assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+ assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

- if (node->view_src != NULL) {
- if (node->buffer == NULL) {
+ if (tensor->view_src != NULL) {
+ if (tensor->buffer == NULL) {
  assert(tensor_alloc->offset == SIZE_MAX);
- if (node->view_src->buffer == NULL) {
+ if (tensor->view_src->buffer == NULL) {
  // this tensor was allocated without ggml-backend
  return;
  }
- ggml_backend_view_init(galloc->buffers[buffer_id], node);
+ ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
  }
  } else {
- if (node->data == NULL) {
+ if (tensor->data == NULL) {
  assert(tensor_alloc->offset != SIZE_MAX);
- assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
  void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
  void * addr = (char *)base + tensor_alloc->offset;
- ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
+ ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
  } else {
- if (node->buffer == NULL) {
+ if (tensor->buffer == NULL) {
  // this tensor was allocated without ggml-backend
  return;
  }
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)

  // reset buffers
  for (int i = 0; i < galloc->n_buffers; i++) {
- // zero size buffers are not allocated
  if (galloc->buffers[i] != NULL) {
  ggml_backend_buffer_reset(galloc->buffers[i]);
  }
  }

  // allocate the graph tensors from the previous assignments
+ // leafs
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+ ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+ }
  // nodes
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
  }
  ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
  }
- // leafs
- for (int i = 0; i < graph->n_leafs; i++) {
- struct ggml_tensor * leaf = graph->leafs[i];
- struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
- ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
- }

  return true;
  }
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
  return false;
  }

- struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
+ struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);

  for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
  if (t->data == NULL) {
  if (t->view_src == NULL) {
- ggml_tallocr_alloc(tallocr, t);
+ ggml_tallocr_alloc(&tallocr, t);
  } else if (t->buffer == NULL) {
  ggml_backend_view_init(buffer, t);
  }
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
  }
  }

- ggml_tallocr_free(tallocr);
-
  *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
  (*buffers)[(*n_buffers)++] = buffer;

@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
  typedef struct ggml_backend * ggml_backend_t;

  // Tensor allocator
- typedef struct ggml_tallocr * ggml_tallocr_t;
+ struct ggml_tallocr {
+ ggml_backend_buffer_t buffer;
+ void * base;
+ size_t alignment;
+ size_t offset;
+ };

- GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
- GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
- GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+ GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

  // Graph allocator
  /*
@@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
  // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
  // returns false if the buffer allocation failed
  GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
- GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
+ GGML_API bool ggml_gallocr_reserve_n(
+ ggml_gallocr_t galloc,
+ struct ggml_cgraph * graph,
+ const int * node_buffer_ids,
+ const int * leaf_buffer_ids);

  // automatic reallocation if the topology changes when using a single buffer
  // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
@@ -86,12 +86,12 @@ extern "C" {
  // (optional) asynchronous tensor data access
  void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
  void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+ bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

  // (optional) complete all pending operations
  void (*GGML_CALL synchronize)(ggml_backend_t backend);

- // create a plan for ggml_cgraph and free it
+ // compute graph with a plan (not used currently)
  ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
  void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

@@ -102,16 +102,27 @@ extern "C" {

  // check if the backend supports an operation
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+ // (optional) event synchronization
+ ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
+ void (*GGML_CALL event_free) (ggml_backend_event_t event);
+ void (*GGML_CALL event_record) (ggml_backend_event_t event);
+ void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+ void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
  };

  struct ggml_backend {
  ggml_guid_t guid;

  struct ggml_backend_i iface;
-
  ggml_backend_context_t context;
  };

+ struct ggml_backend_event {
+ ggml_backend_t backend;
+ void * context;
+ };
+

  //
  // Backend registry