llama_cpp 0.14.0 → 0.14.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: c7d855ccd32ae097f26a671751d6a2178361cf8d8a6c1b99af37859f2c47ca03
- data.tar.gz: 3b17318424d08c65ad34da3fa14956c86db0a2ea05ac174323a9b8d2b9e69d59
+ metadata.gz: c2a192fa17c1d313a93306e415ec27dfb8fb6ce993b9fc78797ed6e1d38ca63f
+ data.tar.gz: f800e54961a8bea5de95373d15f0cda30f7e95edd655cc0504247dfefcff473a
  SHA512:
- metadata.gz: 2d90bf9fdd8dbaf5e67b7fb8797a9412168ae6ce5fcfc4c6aca34e194d5beb5204184b5bb36d65dc507a7a618ac9e938987e8d8bf5871e4eb6304b5e6de06020
- data.tar.gz: eab524367ace146eb6e20786bd530cead145e1651bcdb726afbb5364609d04b22ca8a515016bb0c2d154ea97fb62f19222c122bc9bb5efe7fc389a6f259da6f0
+ metadata.gz: 48cefba1491319f82d52a46e8be34b5f0115dbe80bd6a9fdbf4fe0e190581a6b1ff8c3e2b2dfdaefeaa0b7cb11c8b9f5a84bcb60354f64248abbee3d488378ee
+ data.tar.gz: 9c6d75d3818b61192bd5c93a8b091003e2342f28102de1fbc9a1a02955a7c89e2a144b82bbe83e805b3f741261e967469c3ad2f6d347b1b870fb51880b850d89
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
+ ## [[0.14.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.0...v0.14.1)] - 2024-03-16
+
+ - Bump llama.cpp from b2361 to b2435.
+ - Add constant for vocabulary type: `LLAMA_VOCAB_TYPE_NONE`.
+ - Add `n_ubatch` and `n_seq_max` accessors to `ContextParams`.
+ - Add `n_ubatch`, `n_seq_max`, `set_causal_attn`, and `synchronize` methods to `Context`.
+
  ## [[0.14.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.13.0...v0.14.0)] - 2024-03-09

  **Breaking Changes**
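
As a quick orientation for the 0.14.1 changelog entries above, here is a minimal usage sketch. Hedged: the constructor keyword arguments follow the gem's existing `Model.new(model_path:, params:)` and `Context.new(model:, params:)` style, and the file path and numeric values are placeholders, not recommendations.

require 'llama_cpp'

# Placeholder model path; any GGUF model works here.
model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)

context_params = LLaMACpp::ContextParams.new
context_params.n_ubatch = 256   # new in 0.14.1: physical micro-batch size
context_params.n_seq_max = 2    # new in 0.14.1: maximum number of sequences

context = LLaMACpp::Context.new(model: model, params: context_params)
context.n_ubatch              # micro-batch size reported by the context
context.n_seq_max             # maximum number of sequences for this context
context.set_causal_attn(true) # toggle the causal attention mask
context.synchronize           # block until pending backend work has finished

Internally these map onto llama.cpp's `llama_n_ubatch`, `llama_n_seq_max`, `llama_set_causal_attn`, and `llama_synchronize`, as the binding code in the hunks below shows.
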
@@ -946,6 +946,10 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
  rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
  rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
+ rb_define_method(rb_cLLaMAContextParams, "n_ubatch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ubatch), 1);
+ rb_define_method(rb_cLLaMAContextParams, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_params_get_n_ubatch), 0);
+ rb_define_method(rb_cLLaMAContextParams, "n_seq_max=", RUBY_METHOD_FUNC(_llama_context_params_set_n_seq_max), 1);
+ rb_define_method(rb_cLLaMAContextParams, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_params_get_n_seq_max), 0);
  rb_define_method(rb_cLLaMAContextParams, "n_threads=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads), 1);
  rb_define_method(rb_cLLaMAContextParams, "n_threads", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads), 0);
  rb_define_method(rb_cLLaMAContextParams, "n_threads_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads_batch), 1);
@@ -1031,6 +1035,30 @@ private:
  return INT2NUM(ptr->params.n_batch);
  }

+ // n_ubatch
+ static VALUE _llama_context_params_set_n_ubatch(VALUE self, VALUE n_ubatch) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.n_ubatch = NUM2INT(n_ubatch);
+ return INT2NUM(ptr->params.n_ubatch);
+ }
+
+ static VALUE _llama_context_params_get_n_ubatch(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return INT2NUM(ptr->params.n_ubatch);
+ }
+
+ // n_seq_max
+ static VALUE _llama_context_params_set_n_seq_max(VALUE self, VALUE n_seq_max) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.n_seq_max = NUM2INT(n_seq_max);
+ return INT2NUM(ptr->params.n_seq_max);
+ }
+
+ static VALUE _llama_context_params_get_n_seq_max(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return INT2NUM(ptr->params.n_seq_max);
+ }
+
  // n_threads
  static VALUE _llama_context_params_set_n_threads(VALUE self, VALUE n_threads) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -2019,6 +2047,8 @@ public:
  rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
+ rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
+ rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2033,6 +2063,8 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
  rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
+ rb_define_method(rb_cLLaMAContext, "set_causal_attn", RUBY_METHOD_FUNC(_llama_context_set_causal_attn), 1);
+ rb_define_method(rb_cLLaMAContext, "synchronize", RUBY_METHOD_FUNC(_llama_context_synchronize), 0);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
@@ -2250,6 +2282,24 @@ private:
  return UINT2NUM(llama_n_batch(ptr->ctx));
  }

+ static VALUE _llama_context_n_ubatch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_ubatch(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_seq_max(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_seq_max(ptr->ctx));
+ }
+
  static VALUE _llama_context_get_timings(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2395,6 +2445,26 @@ private:
  return Qnil;
  }

+ static VALUE _llama_context_set_causal_attn(VALUE self, VALUE causal_attn) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_set_causal_attn(ptr->ctx, RTEST(causal_attn) ? true : false);
+ return Qnil;
+ }
+
+ static VALUE _llama_context_synchronize(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_synchronize(ptr->ctx);
+ return Qnil;
+ }
+
  static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[1] = { rb_intern("session_path") };
@@ -3204,6 +3274,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
  rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);

+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_NONE", INT2NUM(LLAMA_VOCAB_TYPE_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
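
With `LLAMA_VOCAB_TYPE_NONE` now registered alongside the existing vocabulary-type constants, a small illustrative helper can map the integer IDs to readable labels. The hash and label strings below are a sketch, not part of the gem; only the constant names come from the binding code above.

require 'llama_cpp'

# Illustrative only: map vocabulary-type constants to human-readable labels.
VOCAB_TYPE_LABELS = {
  LLaMACpp::LLAMA_VOCAB_TYPE_NONE => 'none (model has no vocabulary)',
  LLaMACpp::LLAMA_VOCAB_TYPE_SPM  => 'SentencePiece (SPM)',
  LLaMACpp::LLAMA_VOCAB_TYPE_BPE  => 'byte-pair encoding (BPE)',
  LLaMACpp::LLAMA_VOCAB_TYPE_WPM  => 'WordPiece (WPM)'
}.freeze

def vocab_type_label(type_id)
  VOCAB_TYPE_LABELS.fetch(type_id, 'unknown vocabulary type')
end
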
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.0'
+ VERSION = '0.14.1'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2361'
+ LLAMA_CPP_VERSION = 'b2435'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,7 @@ module LLaMACpp
  LLAMA_CPP_VERSION: String
  LLAMA_DEFALUT_SEED: String

+ LLAMA_VOCAB_TYPE_NONE: Integer
  LLAMA_VOCAB_TYPE_SPM: Integer
  LLAMA_VOCAB_TYPE_BPE: Integer
  LLAMA_VOCAB_TYPE_WPM: Integer
@@ -207,6 +208,8 @@ module LLaMACpp
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
  def n_batch: () -> Integer
+ def n_ubatch: () -> Integer
+ def n_seq_max: () -> Integer
  def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
@@ -221,6 +224,8 @@ module LLaMACpp
  def kv_cache_defrag: () -> void
  def kv_cache_update: () -> void
  def set_rng_seed: (Integer) -> void
+ def set_causal_attn: (bool) -> void
+ def synchronize: () -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
@@ -250,6 +255,10 @@ module LLaMACpp
  def n_ctx=: (Integer) -> Integer
  def n_batch: () -> Integer
  def n_batch=: (Integer) -> Integer
+ def n_ubatch: () -> Integer
+ def n_ubatch=: (Integer) -> Integer
+ def n_seq_max: () -> Integer
+ def n_seq_max=: (Integer) -> Integer
  def n_threads: () -> Integer
  def n_threads=: (Integer) -> Integer
  def n_threads_batch: () -> Integer
@@ -2,7 +2,7 @@
  BUILD_TARGETS = \
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
  simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+ speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

  # Binaries only useful for tests
  TEST_TARGETS = \
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
  MK_CPPFLAGS += -D_BSD_SOURCE
  endif

+ ifdef LLAMA_SCHED_MAX_COPIES
+ MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+ endif
+
  ifdef LLAMA_DEBUG
  MK_CFLAGS += -O0 -g
  MK_CXXFLAGS += -O0 -g
@@ -201,6 +205,10 @@ ifdef LLAMA_SERVER_VERBOSE
  MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
  endif

+ ifdef LLAMA_SERVER_SSL
+ MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+ MK_LDFLAGS += -lssl -lcrypto
+ endif

  ifdef LLAMA_CODE_COVERAGE
  MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
@@ -451,7 +459,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
  ifdef LLAMA_CUDA_CCBIN
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
  endif
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
  ifdef JETSON_EOL_MODULE_DETECT
  $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
  else
@@ -551,15 +559,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
  $(CC) $(CFLAGS) -c $< -o $@

  ifdef LLAMA_METAL_EMBED_LIBRARY
- ggml-metal-embed.o: ggml-metal.metal
+ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
  @echo "Embedding Metal library"
+ @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
  $(eval TEMP_ASSEMBLY=$(shell mktemp))
- @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
- @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
- @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
- @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
- @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
- @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+ @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+ @echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
  @$(AS) $(TEMP_ASSEMBLY) -o $@
  @rm -f ${TEMP_ASSEMBLY}
  endif
@@ -628,12 +637,15 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
  ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
  $(CC) $(CFLAGS) -c $< -o $@

- ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
  $(CC) $(CFLAGS) -c $< -o $@

- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
+ unicode.o: unicode.cpp unicode.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@

- llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+
+ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

  COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
@@ -725,6 +737,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  }
  }

- // TODO: GGML_PAD ?
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
  assert(alignment && !(alignment & (alignment - 1))); // power of 2
  size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
  }

  // tallocr
- struct ggml_tallocr {
- ggml_backend_buffer_t buffer;
- void * base;
- size_t alignment;
- size_t offset;
- };
-
- ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
- ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
- if (talloc == NULL) {
- return NULL;
- }

+ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
  void * base = ggml_backend_buffer_get_base(buffer);
  size_t align = ggml_backend_buffer_get_alignment(buffer);

  assert(align && !(align & (align - 1))); // power of 2

- *talloc = (struct ggml_tallocr) {
+ struct ggml_tallocr talloc = (struct ggml_tallocr) {
  /*.buffer = */ buffer,
  /*.base = */ base,
  /*.alignment = */ align,
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
  return talloc;
  }

- void ggml_tallocr_free(ggml_tallocr_t talloc) {
- free(talloc);
- }
-
- void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
  size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
  size = GGML_PAD(size, talloc->alignment);

@@ -354,12 +338,16 @@ struct hash_node {
  bool allocated;
  };

- //
  struct tensor_alloc {
  size_t offset;
  size_t size_max; // 0 = pre-allocated, unused, or view
  };

+ struct leaf_alloc {
+ int buffer_id;
+ struct tensor_alloc leaf;
+ };
+
  struct node_alloc {
  int buffer_id;
  struct tensor_alloc dst;
@@ -378,7 +366,7 @@ struct ggml_gallocr {
  struct node_alloc * node_allocs; // [n_nodes]
  int n_nodes;

- struct tensor_alloc * leaf_allocs; // [n_leafs]
+ struct leaf_alloc * leaf_allocs; // [n_leafs]
  int n_leafs;
  };

@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
  return node_buffer_ids ? node_buffer_ids[i] : 0;
  }

- static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
  // clear hash tables
  memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
  memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));

+ // allocate leafs
+ // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+ }
+
  // count number of children and views
- // allocate all graph inputs and leafs first to avoid overwriting them
+ // allocate other graph inputs and leafs first to avoid overwriting them
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];

@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  }
  }

- // allocate the remaining leafs that are unused on the graph
- // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
- for (int i = 0; i < graph->n_leafs; i++) {
- struct ggml_tensor * leaf = graph->leafs[i];
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
- if (hn->n_children == 0) {
- assert(!hn->allocated);
- // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
- ggml_gallocr_allocate_node(galloc, leaf, 0);
- }
- }
-
  // allocate tensors
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  }
  }

- bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
  size_t hash_size = graph->visited_hash_table.size;

  // initialize hash table
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }

  // allocate in hash table
- ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+ ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);

  // set the node_allocs from the hash table
  if (galloc->n_nodes < graph->n_nodes) {
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }
  if (galloc->n_leafs < graph->n_leafs) {
  free(galloc->leaf_allocs);
- galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+ galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
  GGML_ASSERT(galloc->leaf_allocs != NULL);
  }
  galloc->n_leafs = graph->n_leafs;
  for (int i = 0; i < graph->n_leafs; i++) {
  struct ggml_tensor * leaf = graph->leafs[i];
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
- galloc->leaf_allocs[i].offset = hn->offset;
- galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
  }

  // reallocate buffers if needed
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
  size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

- if (new_size > cur_size) {
+ // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+ if (new_size > cur_size || galloc->buffers[i] == NULL) {
  #ifndef NDEBUG
  fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }

  bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
- return ggml_gallocr_reserve_n(galloc, graph, NULL);
+ return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
  }

- static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
- assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+ assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

- if (node->view_src != NULL) {
- if (node->buffer == NULL) {
+ if (tensor->view_src != NULL) {
+ if (tensor->buffer == NULL) {
  assert(tensor_alloc->offset == SIZE_MAX);
- if (node->view_src->buffer == NULL) {
+ if (tensor->view_src->buffer == NULL) {
  // this tensor was allocated without ggml-backend
  return;
  }
- ggml_backend_view_init(galloc->buffers[buffer_id], node);
+ ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
  }
  } else {
- if (node->data == NULL) {
+ if (tensor->data == NULL) {
  assert(tensor_alloc->offset != SIZE_MAX);
- assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
  void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
  void * addr = (char *)base + tensor_alloc->offset;
- ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
+ ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
  } else {
- if (node->buffer == NULL) {
+ if (tensor->buffer == NULL) {
  // this tensor was allocated without ggml-backend
  return;
  }
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)

  // reset buffers
  for (int i = 0; i < galloc->n_buffers; i++) {
- // zero size buffers are not allocated
  if (galloc->buffers[i] != NULL) {
  ggml_backend_buffer_reset(galloc->buffers[i]);
  }
  }

  // allocate the graph tensors from the previous assignments
+ // leafs
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+ ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+ }
  // nodes
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
  }
  ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
  }
- // leafs
- for (int i = 0; i < graph->n_leafs; i++) {
- struct ggml_tensor * leaf = graph->leafs[i];
- struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
- ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
- }

  return true;
  }
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
  return false;
  }

- struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
+ struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);

  for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
  if (t->data == NULL) {
  if (t->view_src == NULL) {
- ggml_tallocr_alloc(tallocr, t);
+ ggml_tallocr_alloc(&tallocr, t);
  } else if (t->buffer == NULL) {
  ggml_backend_view_init(buffer, t);
  }
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
  }
  }

- ggml_tallocr_free(tallocr);
-
  *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
  (*buffers)[(*n_buffers)++] = buffer;

@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
  typedef struct ggml_backend * ggml_backend_t;

  // Tensor allocator
- typedef struct ggml_tallocr * ggml_tallocr_t;
+ struct ggml_tallocr {
+ ggml_backend_buffer_t buffer;
+ void * base;
+ size_t alignment;
+ size_t offset;
+ };

- GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
- GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
- GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+ GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

  // Graph allocator
  /*
@@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
  // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
  // returns false if the buffer allocation failed
  GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
- GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
+ GGML_API bool ggml_gallocr_reserve_n(
+ ggml_gallocr_t galloc,
+ struct ggml_cgraph * graph,
+ const int * node_buffer_ids,
+ const int * leaf_buffer_ids);

  // automatic reallocation if the topology changes when using a single buffer
  // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
@@ -86,12 +86,12 @@ extern "C" {
  // (optional) asynchronous tensor data access
  void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
  void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+ bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

  // (optional) complete all pending operations
  void (*GGML_CALL synchronize)(ggml_backend_t backend);

- // create a plan for ggml_cgraph and free it
+ // compute graph with a plan (not used currently)
  ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
  void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

@@ -102,16 +102,27 @@ extern "C" {

  // check if the backend supports an operation
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+ // (optional) event synchronization
+ ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
+ void (*GGML_CALL event_free) (ggml_backend_event_t event);
+ void (*GGML_CALL event_record) (ggml_backend_event_t event);
+ void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+ void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
  };

  struct ggml_backend {
  ggml_guid_t guid;

  struct ggml_backend_i iface;
-
  ggml_backend_context_t context;
  };

+ struct ggml_backend_event {
+ ggml_backend_t backend;
+ void * context;
+ };
+
  //
  // Backend registry
  //