llama_cpp 0.12.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a65b477c93060832783d03b065dd336820bf22e985dd7b9f53a20e5834f29a0d
+  data.tar.gz: 3ab3f5147bb207ddeea4b902e86de41398fbe497bb521ab00a4fe89ccd790d50
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 119a77a344ece09afda87d89321f679b9c53975c6b340150e298fa3869a0bf48849fafd49e5ef18b001311aae10e3fa9aba29c96de2c4aa8535cdad7d01382cb
+  data.tar.gz: 444fc224413ee6fc94b0866da07460e9c95162941fcd80c831c6f7a950373503eba74b10d437724db2c9debec4719c5a9b25875f1b0a014c956bcb424ca8bf47
```
data/CHANGELOG.md
CHANGED

```diff
@@ -1,3 +1,11 @@
+## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
+
+- Bump bundled llama.cpp from b1833 to b1892.
+- Change `LLAMA_SESSION_VERSION` value from 3 to 4.
+- Add constants for split mode: `LLAMA_SPLIT_NONE`, `LLAMA_SPLIT_LAYER`, and `LLAMA_SPLIT_ROW`
+- Add `split_mode` accessor to ModelParams.
+- Add `sample_apply_guidance` method to Context.
+
 ## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
 
 - Bump bundled llama.cpp from b1768 to b1833.
```
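Taken together, these entries expose llama.cpp's tensor-split controls through the Ruby API. Below is a minimal sketch of the new `split_mode` accessor and split-mode constants; the model path, layer count, and the `LLaMACpp::ModelParams` / `LLaMACpp::Model` constructors follow the gem's usual usage pattern and are not part of this diff.

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelParams.new
params.n_gpu_layers = 32                        # placeholder value
params.split_mode = LLaMACpp::LLAMA_SPLIT_LAYER # new in 0.12.2; LLAMA_SPLIT_NONE and LLAMA_SPLIT_ROW are also defined
params.main_gpu = 0

# 'model.gguf' is a placeholder path, not from this diff.
model = LLaMACpp::Model.new(model_path: 'model.gguf', params: params)
```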
data/ext/llama_cpp/llama_cpp.cpp
CHANGED

```diff
@@ -789,6 +789,8 @@ public:
     rb_define_alloc_func(rb_cLLaMAModelParams, llama_model_params_alloc);
     rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_model_params_set_n_gpu_layers), 1);
     rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_model_params_get_n_gpu_layers), 0);
+    rb_define_method(rb_cLLaMAModelParams, "split_mode=", RUBY_METHOD_FUNC(_llama_model_params_set_split_mode), 1);
+    rb_define_method(rb_cLLaMAModelParams, "split_mode", RUBY_METHOD_FUNC(_llama_model_params_get_split_mode), 0);
     rb_define_method(rb_cLLaMAModelParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_model_params_set_main_gpu), 1);
     rb_define_method(rb_cLLaMAModelParams, "main_gpu", RUBY_METHOD_FUNC(_llama_model_params_get_main_gpu), 0);
     rb_define_method(rb_cLLaMAModelParams, "tensor_split", RUBY_METHOD_FUNC(_llama_model_params_get_tensor_split), 0);
@@ -815,6 +817,18 @@ private:
     return INT2NUM(ptr->params.n_gpu_layers);
  }

+  // split_mode
+  static VALUE _llama_model_params_set_split_mode(VALUE self, VALUE split_mode) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    ptr->params.split_mode = static_cast<enum llama_split_mode>(NUM2INT(split_mode));
+    return INT2NUM(ptr->params.split_mode);
+  }
+
+  static VALUE _llama_model_params_get_split_mode(VALUE self) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    return INT2NUM(ptr->params.split_mode);
+  }
+
  // main_gpu
  static VALUE _llama_model_params_set_main_gpu(VALUE self, VALUE main_gpu) {
    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
@@ -2031,6 +2045,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
+    rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
     rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
     rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
     rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
@@ -2553,6 +2568,51 @@ private:
    return Qnil;
  }

+  static VALUE _llama_context_sample_apply_guidance(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("logits"), rb_intern("logits_guidance"), rb_intern("scale") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 0, 3, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+      rb_raise(rb_eArgError, "logits must be an Array");
+      return Qnil;
+    }
+    if (!RB_TYPE_P(kw_values[1], T_ARRAY)) {
+      rb_raise(rb_eArgError, "logits_guidance must be an Array");
+      return Qnil;
+    }
+    if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "scale must be a float");
+      return Qnil;
+    }
+
+    const size_t sz_logits = RARRAY_LEN(kw_values[0]);
+    std::vector<float> logits(sz_logits);
+    for (size_t i = 0; i < sz_logits; i++) {
+      logits[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+    }
+
+    const size_t sz_logits_guidance = RARRAY_LEN(kw_values[1]);
+    std::vector<float> logits_guidance(sz_logits_guidance);
+    for (size_t i = 0; i < sz_logits_guidance; i++) {
+      logits_guidance[i] = NUM2DBL(rb_ary_entry(kw_values[1], i));
+    }
+
+    const float scale = NUM2DBL(kw_values[2]);
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    llama_sample_apply_guidance(ctx_ptr->ctx, logits.data(), logits_guidance.data(), scale);
+
+    return Qnil;
+  }
+
  static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
    VALUE kw_args = Qnil;
    ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
@@ -3244,6 +3304,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));

+  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
+
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
```
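For orientation, here is a rough sketch of how the new `sample_apply_guidance` binding might be called from Ruby. The keyword names and type checks come from the binding above; the dummy arrays and the pre-initialized `context` object are placeholders, since real logits would come from decoding the main prompt and a separate guidance/negative prompt.

```ruby
# `context` is assumed to be an already initialized LLaMACpp::Context.
n_vocab = 8 # placeholder; in practice the vocabulary size of the loaded model
logits          = Array.new(n_vocab, 0.0)
logits_guidance = Array.new(n_vocab, 0.0)

# Wraps llama_sample_apply_guidance: blends the guidance logits with the main
# logits at the given scale. The binding requires logits and logits_guidance
# to be Arrays of Float and scale to be a Float.
context.sample_apply_guidance(logits: logits, logits_guidance: logits_guidance, scale: 1.5)
```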
data/lib/llama_cpp/version.rb
CHANGED

```diff
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.1'
+  VERSION = '0.12.2'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1833'
+  LLAMA_CPP_VERSION = 'b1892'
 end
```
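After upgrading, these two constants give a quick way to confirm the gem version and the bundled llama.cpp build at runtime:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.12.2"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b1892"
```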
data/sig/llama_cpp.rbs
CHANGED

```diff
@@ -44,6 +44,10 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_YARN: Integer
   LLAMA_ROPE_SCALING_MAX_VALUE: Integer

+  LLAMA_SPLIT_NONE: Integer
+  LLAMA_SPLIT_LAYER: Integer
+  LLAMA_SPLIT_ROW: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -138,6 +142,8 @@ module LLaMACpp

     def n_gpu_layers: () -> Integer
     def n_gpu_layers=: (Integer) -> Integer
+    def split_mode: () -> Integer
+    def split_mode=: (Integer) -> Integer
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
@@ -201,6 +207,7 @@ module LLaMACpp
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
+    def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
```
data/vendor/tmp/llama.cpp/Makefile
CHANGED

```diff
@@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
 endif
 endif

-ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
-BUILD_TARGETS += metal
-endif
-
 default: $(BUILD_TARGETS)

 test: $(TEST_TARGETS)
@@ -677,11 +673,6 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-ifdef LLAMA_METAL
-metal: examples/metal/metal.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif
-
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
```
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED

```diff
@@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }

-    AT_PRINTF("block %d\n", best_fit_block);
-
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
@@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
             return;
         }
     }
+
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
     block->addr = (char*)block->addr + size;
@@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }

+    AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
+
     tensor->data = addr;
     tensor->buffer = alloc->buffer;
     if (!alloc->measure) {
@@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
         alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     } else {
         alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+        ggml_backend_buffer_reset(alloc->buffer);
     }
 }

@@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
     return alloc;
 }

-ggml_tallocr_t
+ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
     // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer =
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);

     // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
     return alloc;
 }

-ggml_tallocr_t
-
+ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
+}
+
+ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
+    // create a backend buffer to get the correct tensor allocation sizes
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
     alloc->buffer_owned = true;
     return alloc;
 }

+ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
+}
+
 ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

@@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     if (nbytes == 0) {
         // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
         return NULL;
     }

     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    if (buffer == NULL) {
+        // failed to allocate buffer
+#ifndef NDEBUG
+        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+#endif
+        return NULL;
+    }
+
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);

     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
```
data/vendor/tmp/llama.cpp/ggml-alloc.h
CHANGED

```diff
@@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;

 GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
 GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);

 GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
```
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED

```diff
@@ -16,13 +16,14 @@ extern "C" {
     typedef void * ggml_backend_buffer_type_context_t;

     struct ggml_backend_buffer_type_i {
-
-
-        size_t (*
-
+        const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
+        size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
         // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool (*is_host) (ggml_backend_buffer_type_t buft);
+        bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
     };

     struct ggml_backend_buffer_type {
@@ -34,16 +35,15 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;

     struct ggml_backend_buffer_i {
-
-
-        void *
-        void
-        void
-        void
-
-        void
-        void
-        void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
+        const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
+        void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+        void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
+        void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
+        void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
     };

     struct ggml_backend_buffer {
@@ -51,14 +51,17 @@ extern "C" {
         ggml_backend_buffer_type_t buft;
         ggml_backend_buffer_context_t context;
         size_t size;
+        enum ggml_backend_buffer_usage usage;
     };

-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                    ggml_backend_buffer_type_t buft,
             struct ggml_backend_buffer_i      iface,
                    ggml_backend_buffer_context_t context,
                    size_t size);

+    // do not use directly, use ggml_backend_tensor_copy instead
+    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

     //
     // Backend
@@ -67,33 +70,31 @@ extern "C" {
     typedef void * ggml_backend_context_t;

     struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);

-        void (*free)(ggml_backend_t backend);
+        void (*GGML_CALL free)(ggml_backend_t backend);

         // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);

-        // (optional)
-        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        // (optional) asynchronous tensor data access
+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);

-        // (optional)
-        void (*
-        void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        void (*synchronize)(ggml_backend_t backend);
+        // (optional) complete all pending operations
+        void (*GGML_CALL synchronize)(ggml_backend_t backend);

         // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph without a plan
-        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        // compute graph without a plan (async)
+        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

         // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
     };

     struct ggml_backend {
@@ -102,14 +103,13 @@ extern "C" {
         ggml_backend_context_t context;
     };

-
     //
     // Backend registry
     //

-    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);

-    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);

 #ifdef __cplusplus
 }
```