llama_cpp 0.12.1 → 0.12.2

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 13381408318e71cc1fc55c40ee9be6e62ad9e3ad6a8ce39279bb8040614e9b3b
-  data.tar.gz: 6456734b18865a7811f08d0d9d599771f574f4b59bd5b54a964ece7428115907
+  metadata.gz: a65b477c93060832783d03b065dd336820bf22e985dd7b9f53a20e5834f29a0d
+  data.tar.gz: 3ab3f5147bb207ddeea4b902e86de41398fbe497bb521ab00a4fe89ccd790d50
 SHA512:
-  metadata.gz: 1014349771d7aa3c318027de11603e96d5482e4bd5b1bcf0fd4874040245daf44c4cfb801077a698846459a7619ca9e01e0afc3507fc7bd519e7ba68a000a15d
-  data.tar.gz: 1315ca8954397edb0db93347a10762e35f829377ef3dba0ea9cf6c67f986972ac8e75b46c410a3ceceefc0474f2abbe6f441e56a60e789ef1d2617fc15cfb29e
+  metadata.gz: 119a77a344ece09afda87d89321f679b9c53975c6b340150e298fa3869a0bf48849fafd49e5ef18b001311aae10e3fa9aba29c96de2c4aa8535cdad7d01382cb
+  data.tar.gz: 444fc224413ee6fc94b0866da07460e9c95162941fcd80c831c6f7a950373503eba74b10d437724db2c9debec4719c5a9b25875f1b0a014c956bcb424ca8bf47
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
+## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
+
+- Bump bundled llama.cpp from b1833 to b1892.
+- Change `LLAMA_SESSION_VERSION` value from 3 to 4.
+- Add constants for split mode: `LLAMA_SPLIT_NONE`, `LLAMA_SPLIT_LAYER`, and `LLAMA_SPLIT_ROW`
+- Add `split_mode` accessor to ModelParams.
+- Add `sample_apply_guidance` method to Context.
+
 ## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
 
 - Bump bundled llama.cpp from b1768 to b1833.
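The bullets above map one-to-one onto module-level constants and methods. A minimal sketch of the new surface, assuming the constants are exposed on the `LLaMACpp` module as the `Init_llama_cpp` hunk further down defines them:

```ruby
require 'llama_cpp'

# Session file format version bumped from 3 to 4 in this release.
puts LLaMACpp::LLAMA_SESSION_VERSION

# New split-mode constants, mirroring llama.cpp's llama_split_mode enum.
p [LLaMACpp::LLAMA_SPLIT_NONE, LLaMACpp::LLAMA_SPLIT_LAYER, LLaMACpp::LLAMA_SPLIT_ROW]
```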
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -789,6 +789,8 @@ public:
     rb_define_alloc_func(rb_cLLaMAModelParams, llama_model_params_alloc);
     rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_model_params_set_n_gpu_layers), 1);
     rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_model_params_get_n_gpu_layers), 0);
+    rb_define_method(rb_cLLaMAModelParams, "split_mode=", RUBY_METHOD_FUNC(_llama_model_params_set_split_mode), 1);
+    rb_define_method(rb_cLLaMAModelParams, "split_mode", RUBY_METHOD_FUNC(_llama_model_params_get_split_mode), 0);
     rb_define_method(rb_cLLaMAModelParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_model_params_set_main_gpu), 1);
     rb_define_method(rb_cLLaMAModelParams, "main_gpu", RUBY_METHOD_FUNC(_llama_model_params_get_main_gpu), 0);
     rb_define_method(rb_cLLaMAModelParams, "tensor_split", RUBY_METHOD_FUNC(_llama_model_params_get_tensor_split), 0);
@@ -815,6 +817,18 @@ private:
     return INT2NUM(ptr->params.n_gpu_layers);
   }
 
+  // split_mode
+  static VALUE _llama_model_params_set_split_mode(VALUE self, VALUE split_mode) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    ptr->params.split_mode = static_cast<enum llama_split_mode>(NUM2INT(split_mode));
+    return INT2NUM(ptr->params.split_mode);
+  }
+
+  static VALUE _llama_model_params_get_split_mode(VALUE self) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    return INT2NUM(ptr->params.split_mode);
+  }
+
   // main_gpu
   static VALUE _llama_model_params_set_main_gpu(VALUE self, VALUE main_gpu) {
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
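These two `rb_define_method` entries give `ModelParams` a plain integer accessor over `llama_model_params.split_mode`. A usage sketch; the `model.gguf` path is a placeholder, `LLaMACpp::Model.new(model_path:, params:)` follows the gem's documented constructor, and the setting only matters on multi-GPU builds:

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelParams.new
params.n_gpu_layers = 35
# Split whole layers across GPUs; LLAMA_SPLIT_ROW would split tensors row-wise instead.
params.split_mode = LLaMACpp::LLAMA_SPLIT_LAYER
puts params.split_mode # prints the integer value of the chosen constant

model = LLaMACpp::Model.new(model_path: 'model.gguf', params: params)
```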
@@ -2031,6 +2045,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
+    rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
     rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
     rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
     rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
@@ -2553,6 +2568,51 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_sample_apply_guidance(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("logits"), rb_intern("logits_guidance"), rb_intern("scale") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 0, 3, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+      rb_raise(rb_eArgError, "logits must be an Array");
+      return Qnil;
+    }
+    if (!RB_TYPE_P(kw_values[1], T_ARRAY)) {
+      rb_raise(rb_eArgError, "logits_guidance must be an Array");
+      return Qnil;
+    }
+    if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "scale must be a float");
+      return Qnil;
+    }
+
+    const size_t sz_logits = RARRAY_LEN(kw_values[0]);
+    std::vector<float> logits(sz_logits);
+    for (size_t i = 0; i < sz_logits; i++) {
+      logits[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+    }
+
+    const size_t sz_logits_guidance = RARRAY_LEN(kw_values[1]);
+    std::vector<float> logits_guidance(sz_logits_guidance);
+    for (size_t i = 0; i < sz_logits_guidance; i++) {
+      logits_guidance[i] = NUM2DBL(rb_ary_entry(kw_values[1], i));
+    }
+
+    const float scale = NUM2DBL(kw_values[2]);
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    llama_sample_apply_guidance(ctx_ptr->ctx, logits.data(), logits_guidance.data(), scale);
+
+    return Qnil;
+  }
+
   static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
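From Ruby, the binding above takes three keyword arguments and returns `nil`. A sketch with dummy values, where `context` is assumed to be an initialized `LLaMACpp::Context`; note that the wrapper copies both arrays into local C++ vectors, so the guided logits are not written back into the Ruby `logits` array:

```ruby
# In real use both arrays would come from evaluated contexts: `logits` from the
# main prompt, `logits_guidance` from a negative-prompt (guidance) context.
logits          = [0.1, 0.2, 0.3]
logits_guidance = [0.3, 0.1, 0.2]

# scale must be a Float; the binding raises ArgumentError for an Integer.
context.sample_apply_guidance(logits: logits, logits_guidance: logits_guidance, scale: 1.5)
```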
@@ -3244,6 +3304,10 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
+
  std::stringstream ss_magic;
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.1'
+  VERSION = '0.12.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1833'
+  LLAMA_CPP_VERSION = 'b1892'
 end
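A quick way to confirm the upgrade took effect, using the two constants changed above:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.12.2"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b1892"
```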
data/sig/llama_cpp.rbs CHANGED
@@ -44,6 +44,10 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_YARN: Integer
   LLAMA_ROPE_SCALING_MAX_VALUE: Integer
 
+  LLAMA_SPLIT_NONE: Integer
+  LLAMA_SPLIT_LAYER: Integer
+  LLAMA_SPLIT_ROW: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -138,6 +142,8 @@ module LLaMACpp
 
     def n_gpu_layers: () -> Integer
     def n_gpu_layers=: (Integer) -> Integer
+    def split_mode: () -> Integer
+    def split_mode=: (Integer) -> Integer
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
@@ -201,6 +207,7 @@ module LLaMACpp
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
+    def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
 endif
 endif
 
-ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
-BUILD_TARGETS += metal
-endif
-
 default: $(BUILD_TARGETS)
 
 test: $(TEST_TARGETS)
@@ -677,11 +673,6 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-ifdef LLAMA_METAL
-metal: examples/metal/metal.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif
-
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED
@@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }
 
-    AT_PRINTF("block %d\n", best_fit_block);
-
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
@@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
             return;
         }
     }
+
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
     block->addr = (char*)block->addr + size;
@@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }
 
+    AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
+
     tensor->data = addr;
     tensor->buffer = alloc->buffer;
     if (!alloc->measure) {
@@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
         alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     } else {
         alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+        ggml_backend_buffer_reset(alloc->buffer);
     }
 }
 
@@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
     return alloc;
 }
 
-ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
     // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
 
     // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
     return alloc;
 }
 
-ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
+ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
+}
+
+ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
+    // create a backend buffer to get the correct tensor allocation sizes
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
     alloc->buffer_owned = true;
     return alloc;
 }
 
+ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
+}
+
 ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
@@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     if (nbytes == 0) {
         // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
         return NULL;
     }
 
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    if (buffer == NULL) {
+        // failed to allocate buffer
+#ifndef NDEBUG
+        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+#endif
+        return NULL;
+    }
+
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
data/vendor/tmp/llama.cpp/ggml-alloc.h CHANGED
@@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;
 
 GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
 GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
 
 GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
data/vendor/tmp/llama.cpp/ggml-backend-impl.h CHANGED
@@ -16,13 +16,14 @@ extern "C" {
     typedef void * ggml_backend_buffer_type_context_t;
 
     struct ggml_backend_buffer_type_i {
-        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
         // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool (*is_host) (ggml_backend_buffer_type_t buft);
+        bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {
@@ -34,16 +35,15 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
-        void   (*free_buffer) (ggml_backend_buffer_t buffer);
-        //void (*reset)       (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void * (*get_base)    (ggml_backend_buffer_t buffer);
-        void   (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void   (*set_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void   (*get_tensor)  (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
+        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
     };
 
     struct ggml_backend_buffer {
@@ -51,14 +51,17 @@ extern "C" {
         ggml_backend_buffer_type_t buft;
         ggml_backend_buffer_context_t context;
         size_t size;
+        enum ggml_backend_buffer_usage usage;
     };
 
-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
         ggml_backend_buffer_type_t buft,
         struct ggml_backend_buffer_i iface,
         ggml_backend_buffer_context_t context,
         size_t size);
 
+    // do not use directly, use ggml_backend_tensor_copy instead
+    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
     //
     // Backend
@@ -67,33 +70,31 @@ extern "C" {
     typedef void * ggml_backend_context_t;
 
     struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
 
-        void (*free)(ggml_backend_t backend);
+        void (*GGML_CALL free)(ggml_backend_t backend);
 
         // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
 
-        // (optional) asynchroneous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        // (optional) asynchronous tensor data access
+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        // (optional) asynchroneous tensor copy
-        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        void (*synchronize)(ggml_backend_t backend);
+        // (optional) complete all pending operations
+        void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-        // compute graph without a plan
-        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        // compute graph without a plan (async)
+        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
     };
 
     struct ggml_backend {
@@ -102,14 +103,13 @@ extern "C" {
         ggml_backend_context_t context;
     };
 
-
     //
     // Backend registry
     //
 
-    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
 
-    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
 
 #ifdef __cplusplus
 }