llama_cpp 0.12.0 → 0.12.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 87010edca1b352ae7bdd3a693451893b13dd75e9e109f9e2b42f6164cc186b08
- data.tar.gz: ff34254b6377698903dcf771663b91c3c804111228888d96e91363bd0f29d3a6
+ metadata.gz: a65b477c93060832783d03b065dd336820bf22e985dd7b9f53a20e5834f29a0d
+ data.tar.gz: 3ab3f5147bb207ddeea4b902e86de41398fbe497bb521ab00a4fe89ccd790d50
  SHA512:
- metadata.gz: a23aa59fa4936940b28942398bfe98bdb09574162943ebaff31cdbda19394c7690f6c780f49da31eecc4b77427718a8b7ee58e62b2adb087100e1eee66310abc
- data.tar.gz: 5cc105e69fc81d4616d93cd036af70f809be0c99b9155a6d3e386c9900ca012123353c23417ce56a5a64a1d805108b35de2d9feb5a6265c110d9341e5a2e242b
+ metadata.gz: 119a77a344ece09afda87d89321f679b9c53975c6b340150e298fa3869a0bf48849fafd49e5ef18b001311aae10e3fa9aba29c96de2c4aa8535cdad7d01382cb
+ data.tar.gz: 444fc224413ee6fc94b0866da07460e9c95162941fcd80c831c6f7a950373503eba74b10d437724db2c9debec4719c5a9b25875f1b0a014c956bcb424ca8bf47
data/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
+ ## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
+
+ - Bump bundled llama.cpp from b1833 to b1892.
+ - Change `LLAMA_SESSION_VERSION` value from 3 to 4.
+ - Add constants for split mode: `LLAMA_SPLIT_NONE`, `LLAMA_SPLIT_LAYER`, and `LLAMA_SPLIT_ROW`
+ - Add `split_mode` accessor to ModelParams.
+ - Add `sample_apply_guidance` method to Context.
+
+ ## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
+
+ - Bump bundled llama.cpp from b1768 to b1833.
+ - Add model file type constants.
+ - Add `kv_cache_seq_div` method to `Context`.
+
  ## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11

  - Add `get_one` singleton method to `Batch`.
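
For orientation, the 0.12.1/0.12.2 additions listed above can be exercised from Ruby roughly as follows. This is a minimal sketch following the gem's README-style setup, not part of the release itself; the model path and the position/divisor values are placeholders.

```ruby
require 'llama_cpp'

# Placeholder path to a local GGUF model.
model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# New in 0.12.1: kv_cache_seq_div forwards to llama.cpp's llama_kv_cache_seq_div,
# which divides the cached positions of sequence 0 in the range [0, 64) by 2.
context.kv_cache_seq_div(0, 0, 64, 2)
```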
@@ -789,6 +789,8 @@ public:
  rb_define_alloc_func(rb_cLLaMAModelParams, llama_model_params_alloc);
  rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_model_params_set_n_gpu_layers), 1);
  rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_model_params_get_n_gpu_layers), 0);
+ rb_define_method(rb_cLLaMAModelParams, "split_mode=", RUBY_METHOD_FUNC(_llama_model_params_set_split_mode), 1);
+ rb_define_method(rb_cLLaMAModelParams, "split_mode", RUBY_METHOD_FUNC(_llama_model_params_get_split_mode), 0);
  rb_define_method(rb_cLLaMAModelParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_model_params_set_main_gpu), 1);
  rb_define_method(rb_cLLaMAModelParams, "main_gpu", RUBY_METHOD_FUNC(_llama_model_params_get_main_gpu), 0);
  rb_define_method(rb_cLLaMAModelParams, "tensor_split", RUBY_METHOD_FUNC(_llama_model_params_get_tensor_split), 0);
@@ -815,6 +817,18 @@ private:
  return INT2NUM(ptr->params.n_gpu_layers);
  }

+ // split_mode
+ static VALUE _llama_model_params_set_split_mode(VALUE self, VALUE split_mode) {
+ LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+ ptr->params.split_mode = static_cast<enum llama_split_mode>(NUM2INT(split_mode));
+ return INT2NUM(ptr->params.split_mode);
+ }
+
+ static VALUE _llama_model_params_get_split_mode(VALUE self) {
+ LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+ return INT2NUM(ptr->params.split_mode);
+ }
+
  // main_gpu
  static VALUE _llama_model_params_set_main_gpu(VALUE self, VALUE main_gpu) {
  LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
@@ -2026,10 +2040,12 @@ public:
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
  rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+ rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
  rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
  rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
  rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
+ rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
  rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
  rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
  rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
@@ -2378,6 +2394,16 @@ private:
  return Qnil;
  }

+ static VALUE _llama_context_kv_cache_seq_div(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE d) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_kv_cache_seq_div(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(d));
+ return Qnil;
+ }
+
  static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2542,6 +2568,51 @@ private:
  return Qnil;
  }

+ static VALUE _llama_context_sample_apply_guidance(int argc, VALUE* argv, VALUE self) {
+ VALUE kw_args = Qnil;
+ ID kw_table[3] = { rb_intern("logits"), rb_intern("logits_guidance"), rb_intern("scale") };
+ VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+ rb_scan_args(argc, argv, ":", &kw_args);
+ rb_get_kwargs(kw_args, kw_table, 0, 3, kw_values);
+
+ if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+ rb_raise(rb_eArgError, "logits must be an Array");
+ return Qnil;
+ }
+ if (!RB_TYPE_P(kw_values[1], T_ARRAY)) {
+ rb_raise(rb_eArgError, "logits_guidance must be an Array");
+ return Qnil;
+ }
+ if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+ rb_raise(rb_eArgError, "scale must be a float");
+ return Qnil;
+ }
+
+ const size_t sz_logits = RARRAY_LEN(kw_values[0]);
+ std::vector<float> logits(sz_logits);
+ for (size_t i = 0; i < sz_logits; i++) {
+ logits[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+ }
+
+ const size_t sz_logits_guidance = RARRAY_LEN(kw_values[1]);
+ std::vector<float> logits_guidance(sz_logits_guidance);
+ for (size_t i = 0; i < sz_logits_guidance; i++) {
+ logits_guidance[i] = NUM2DBL(rb_ary_entry(kw_values[1], i));
+ }
+
+ const float scale = NUM2DBL(kw_values[2]);
+
+ LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+ if (ctx_ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+
+ llama_sample_apply_guidance(ctx_ptr->ctx, logits.data(), logits_guidance.data(), scale);
+
+ return Qnil;
+ }
+
  static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
@@ -3209,6 +3280,9 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

@@ -3230,6 +3304,10 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));

+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
+
  std::stringstream ss_magic;
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
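
As the keyword handling above shows, `sample_apply_guidance` expects `logits` and `logits_guidance` as Ruby Arrays and `scale` as a Float, all passed as keyword arguments. A call from Ruby would look roughly like this (a sketch; the zero-filled arrays and the vocabulary size are placeholders for real logits obtained by evaluating the main and the negative prompt):

```ruby
n_vocab = 32_000 # placeholder; use the loaded model's vocabulary size

# Placeholder logits; in practice these are the model's output logits for the
# main prompt and for the negative (guidance) prompt.
logits          = Array.new(n_vocab, 0.0)
logits_guidance = Array.new(n_vocab, 0.0)

# New in 0.12.2: applies llama.cpp's classifier-free guidance scaling.
context.sample_apply_guidance(logits: logits, logits_guidance: logits_guidance, scale: 1.5)
```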
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.12.0'
+ VERSION = '0.12.2'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1768'
+ LLAMA_CPP_VERSION = 'b1892'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -22,6 +22,9 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer
+ LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
+ LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer

  LLAMA_KV_OVERRIDE_INT: Integer
  LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -41,6 +44,10 @@ module LLaMACpp
  LLAMA_ROPE_SCALING_YARN: Integer
  LLAMA_ROPE_SCALING_MAX_VALUE: Integer

+ LLAMA_SPLIT_NONE: Integer
+ LLAMA_SPLIT_LAYER: Integer
+ LLAMA_SPLIT_ROW: Integer
+
  def self?.backend_init: (?numa: bool) -> void
  def self?.backend_free: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -135,6 +142,8 @@ module LLaMACpp

  def n_gpu_layers: () -> Integer
  def n_gpu_layers=: (Integer) -> Integer
+ def split_mode: () -> Integer
+ def split_mode=: (Integer) -> Integer
  def main_gpu: () -> Integer
  def main_gpu=: (Integer) -> Integer
  def tensor_split: () -> Array[Float]
@@ -193,10 +202,12 @@ module LLaMACpp
  def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
  def kv_cache_seq_keep: (Integer) -> void
  def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+ def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
  def set_rng_seed: (Integer) -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
+ def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
  def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
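
The `LLAMA_SPLIT_*` constants and the `split_mode` accessor declared above correspond to llama.cpp's `llama_split_mode` enum, which controls how model weights are distributed when several GPUs are available. A sketch of setting them (the values are illustrative, and whether a given mode has any effect depends on how the bundled llama.cpp was built):

```ruby
params = LLaMACpp::ModelParams.new
params.n_gpu_layers = 32                        # offload layers to the GPU as before
params.split_mode   = LLaMACpp::LLAMA_SPLIT_ROW # new in 0.12.2: split tensors row-wise across GPUs
params.main_gpu     = 0                         # GPU used for small tensors and intermediate results

model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
```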
@@ -1,8 +1,8 @@
  # Define the default target now so that it is always the first target
  BUILD_TARGETS = \
- main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
  simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
+ speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o

  # Binaries only useful for tests
  TEST_TARGETS = \
@@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
  endif
  endif

- ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
- BUILD_TARGETS += metal
- endif
-
  default: $(BUILD_TARGETS)

  test: $(TEST_TARGETS)
@@ -620,6 +616,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
  perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+ imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
  embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -671,10 +670,8 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
  lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

- ifdef LLAMA_METAL
- metal: examples/metal/metal.cpp ggml.o $(OBJS)
- $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- endif
+ passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

  ifeq ($(UNAME_S),Darwin)
  swift: examples/batched.swift
@@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
  }
  }

- AT_PRINTF("block %d\n", best_fit_block);
-
  if (best_fit_block == -1) {
  // the last block is our last resort
  struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
@@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
  return;
  }
  }
+
  struct free_block * block = &alloc->free_blocks[best_fit_block];
  void * addr = block->addr;
  block->addr = (char*)block->addr + size;
@@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
  }
  }

+ AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
+
  tensor->data = addr;
  tensor->buffer = alloc->buffer;
  if (!alloc->measure) {
@@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
  alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
  } else {
  alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+ ggml_backend_buffer_reset(alloc->buffer);
  }
  }

@@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
  return alloc;
  }

- ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+ ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
  // create a backend buffer to get the correct tensor allocation sizes
- ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);

  // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
  ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
  return alloc;
  }

- ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
- ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
+ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+ return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
+ }
+
+ ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
+ // create a backend buffer to get the correct tensor allocation sizes
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
  ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
  alloc->buffer_owned = true;
  return alloc;
  }

+ ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+ return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
+ }
+
  ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
  ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

@@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

  if (nbytes == 0) {
  // all the tensors in the context are already allocated
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+ #endif
  return NULL;
  }

  ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+ if (buffer == NULL) {
+ // failed to allocate buffer
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+ #endif
+ return NULL;
+ }
+
  ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);

  for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
@@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;

  GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
  GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
- GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+ GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
  GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+ GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+ GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
  GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);

  GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
@@ -16,13 +16,14 @@ extern "C" {
  typedef void * ggml_backend_buffer_type_context_t;

  struct ggml_backend_buffer_type_i {
- ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
- size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
- size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
- bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
+ ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
+ size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+ size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+ bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
  // check if tensor data is in host memory
  // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
- bool (*is_host) (ggml_backend_buffer_type_t buft);
+ bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
  };

  struct ggml_backend_buffer_type {
@@ -34,16 +35,15 @@ extern "C" {
  typedef void * ggml_backend_buffer_context_t;

  struct ggml_backend_buffer_i {
- void (*free_buffer) (ggml_backend_buffer_t buffer);
- //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
- void * (*get_base) (ggml_backend_buffer_t buffer);
- void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
- void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
- void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
- void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
+ const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
+ void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+ void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
+ void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+ void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
+ void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
  };

  struct ggml_backend_buffer {
@@ -51,14 +51,17 @@ extern "C" {
  ggml_backend_buffer_type_t buft;
  ggml_backend_buffer_context_t context;
  size_t size;
+ enum ggml_backend_buffer_usage usage;
  };

- ggml_backend_buffer_t ggml_backend_buffer_init(
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
  ggml_backend_buffer_type_t buft,
  struct ggml_backend_buffer_i iface,
  ggml_backend_buffer_context_t context,
  size_t size);

+ // do not use directly, use ggml_backend_tensor_copy instead
+ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

  //
  // Backend
@@ -67,33 +70,31 @@ extern "C" {
  typedef void * ggml_backend_context_t;

  struct ggml_backend_i {
- const char * (*get_name)(ggml_backend_t backend);
+ const char * (*GGML_CALL get_name)(ggml_backend_t backend);

- void (*free)(ggml_backend_t backend);
+ void (*GGML_CALL free)(ggml_backend_t backend);

  // buffer allocation
- ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+ ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);

- // (optional) asynchroneous tensor data access
- void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ // (optional) asynchronous tensor data access
+ void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);

- // (optional) asynchroneous tensor copy
- void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
- void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
- void (*synchronize)(ggml_backend_t backend);
+ // (optional) complete all pending operations
+ void (*GGML_CALL synchronize)(ggml_backend_t backend);

  // compute graph with a plan
- ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
- void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+ void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

- // compute graph without a plan
- void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ // compute graph without a plan (async)
+ bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

  // check if the backend supports an operation
- bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+ bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
  };

  struct ggml_backend {
@@ -102,14 +103,13 @@ extern "C" {
  ggml_backend_context_t context;
  };

-
  //
  // Backend registry
  //

- typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+ typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);

- void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);

  #ifdef __cplusplus
  }