llama_cpp 0.12.1 → 0.12.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a65b477c93060832783d03b065dd336820bf22e985dd7b9f53a20e5834f29a0d
+  data.tar.gz: 3ab3f5147bb207ddeea4b902e86de41398fbe497bb521ab00a4fe89ccd790d50
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 119a77a344ece09afda87d89321f679b9c53975c6b340150e298fa3869a0bf48849fafd49e5ef18b001311aae10e3fa9aba29c96de2c4aa8535cdad7d01382cb
+  data.tar.gz: 444fc224413ee6fc94b0866da07460e9c95162941fcd80c831c6f7a950373503eba74b10d437724db2c9debec4719c5a9b25875f1b0a014c956bcb424ca8bf47
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
+## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
+
+- Bump bundled llama.cpp from b1833 to b1892.
+- Change `LLAMA_SESSION_VERSION` value from 3 to 4.
+- Add constants for split mode: `LLAMA_SPLIT_NONE`, `LLAMA_SPLIT_LAYER`, and `LLAMA_SPLIT_ROW`
+- Add `split_mode` accessor to ModelParams.
+- Add `sample_apply_guidance` method to Context.
+
 ## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
 
 - Bump bundled llama.cpp from b1768 to b1833.
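The split-mode additions listed above map onto the new ModelParams bindings shown in the diff below. A minimal Ruby sketch of how they might be used, assuming a multi-GPU build of llama.cpp; the GGUF path is a placeholder and `Model.new` keyword arguments follow the gem's existing API:

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelParams.new
params.split_mode = LLaMACpp::LLAMA_SPLIT_LAYER # split layers across available GPUs
params.n_gpu_layers = 32

# '/path/to/model.gguf' is a placeholder path.
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
puts params.split_mode # returns the Integer value that was set
```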
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -789,6 +789,8 @@ public:
     rb_define_alloc_func(rb_cLLaMAModelParams, llama_model_params_alloc);
     rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_model_params_set_n_gpu_layers), 1);
     rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_model_params_get_n_gpu_layers), 0);
+    rb_define_method(rb_cLLaMAModelParams, "split_mode=", RUBY_METHOD_FUNC(_llama_model_params_set_split_mode), 1);
+    rb_define_method(rb_cLLaMAModelParams, "split_mode", RUBY_METHOD_FUNC(_llama_model_params_get_split_mode), 0);
     rb_define_method(rb_cLLaMAModelParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_model_params_set_main_gpu), 1);
     rb_define_method(rb_cLLaMAModelParams, "main_gpu", RUBY_METHOD_FUNC(_llama_model_params_get_main_gpu), 0);
     rb_define_method(rb_cLLaMAModelParams, "tensor_split", RUBY_METHOD_FUNC(_llama_model_params_get_tensor_split), 0);
@@ -815,6 +817,18 @@ private:
     return INT2NUM(ptr->params.n_gpu_layers);
   }
 
+  // split_mode
+  static VALUE _llama_model_params_set_split_mode(VALUE self, VALUE split_mode) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    ptr->params.split_mode = static_cast<enum llama_split_mode>(NUM2INT(split_mode));
+    return INT2NUM(ptr->params.split_mode);
+  }
+
+  static VALUE _llama_model_params_get_split_mode(VALUE self) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    return INT2NUM(ptr->params.split_mode);
+  }
+
   // main_gpu
   static VALUE _llama_model_params_set_main_gpu(VALUE self, VALUE main_gpu) {
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
@@ -2031,6 +2045,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
+    rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
     rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
     rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
     rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
@@ -2553,6 +2568,51 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_sample_apply_guidance(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("logits"), rb_intern("logits_guidance"), rb_intern("scale") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 0, 3, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+      rb_raise(rb_eArgError, "logits must be an Array");
+      return Qnil;
+    }
+    if (!RB_TYPE_P(kw_values[1], T_ARRAY)) {
+      rb_raise(rb_eArgError, "logits_guidance must be an Array");
+      return Qnil;
+    }
+    if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "scale must be a float");
+      return Qnil;
+    }
+
+    const size_t sz_logits = RARRAY_LEN(kw_values[0]);
+    std::vector<float> logits(sz_logits);
+    for (size_t i = 0; i < sz_logits; i++) {
+      logits[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+    }
+
+    const size_t sz_logits_guidance = RARRAY_LEN(kw_values[1]);
+    std::vector<float> logits_guidance(sz_logits_guidance);
+    for (size_t i = 0; i < sz_logits_guidance; i++) {
+      logits_guidance[i] = NUM2DBL(rb_ary_entry(kw_values[1], i));
+    }
+
+    const float scale = NUM2DBL(kw_values[2]);
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    llama_sample_apply_guidance(ctx_ptr->ctx, logits.data(), logits_guidance.data(), scale);
+
+    return Qnil;
+  }
+
   static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
@@ -3244,6 +3304,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
+
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
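As a rough usage sketch of the `sample_apply_guidance` binding defined above: it takes keyword arguments `logits:`, `logits_guidance:`, and `scale:`, where both logit arrays are plain Ruby arrays of floats and `scale` must be a Float. The setup below is illustrative only and assumes the gem's existing `Context#logits` accessor and a second context evaluated on a negative (guidance) prompt:

```ruby
# Logits for the main prompt and for a negative-prompt (guidance) context;
# both contexts are assumed to have been evaluated already.
logits          = context.logits
logits_guidance = guidance_context.logits

context.sample_apply_guidance(
  logits: logits,
  logits_guidance: logits_guidance,
  scale: 1.5 # must be a Float, per the argument check in the binding
)
```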
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.1'
+  VERSION = '0.12.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1833'
+  LLAMA_CPP_VERSION = 'b1892'
 end
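The two constants above can be read at runtime to confirm which gem release and bundled llama.cpp revision are installed:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.12.2"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b1892"
```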
data/sig/llama_cpp.rbs
CHANGED
@@ -44,6 +44,10 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_YARN: Integer
   LLAMA_ROPE_SCALING_MAX_VALUE: Integer
 
+  LLAMA_SPLIT_NONE: Integer
+  LLAMA_SPLIT_LAYER: Integer
+  LLAMA_SPLIT_ROW: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -138,6 +142,8 @@ module LLaMACpp
 
   def n_gpu_layers: () -> Integer
   def n_gpu_layers=: (Integer) -> Integer
+  def split_mode: () -> Integer
+  def split_mode=: (Integer) -> Integer
   def main_gpu: () -> Integer
   def main_gpu=: (Integer) -> Integer
   def tensor_split: () -> Array[Float]
@@ -201,6 +207,7 @@ module LLaMACpp
   def load_session_file: (session_path: String) -> void
   def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
   def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
+  def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
   def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
   def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
   def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
   endif
 endif
 
-ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
-BUILD_TARGETS += metal
-endif
-
 default: $(BUILD_TARGETS)
 
 test: $(TEST_TARGETS)
@@ -677,11 +673,6 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-ifdef LLAMA_METAL
-metal: examples/metal/metal.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif
-
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }
 
-    AT_PRINTF("block %d\n", best_fit_block);
-
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
@@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
             return;
         }
     }
+
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
     block->addr = (char*)block->addr + size;
@@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }
 
+    AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
+
     tensor->data = addr;
     tensor->buffer = alloc->buffer;
     if (!alloc->measure) {
@@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
         alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     } else {
         alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+        ggml_backend_buffer_reset(alloc->buffer);
     }
 }
 
@@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
     return alloc;
 }
 
-ggml_tallocr_t
+ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
     // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer =
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
 
     // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
     return alloc;
 }
 
-ggml_tallocr_t
-
+ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
+}
+
+ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
+    // create a backend buffer to get the correct tensor allocation sizes
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
     alloc->buffer_owned = true;
     return alloc;
 }
 
+ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
+}
+
 ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
@@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     if (nbytes == 0) {
         // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
         return NULL;
     }
 
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    if (buffer == NULL) {
+        // failed to allocate buffer
+#ifndef NDEBUG
+        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+#endif
+        return NULL;
+    }
+
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
data/vendor/tmp/llama.cpp/ggml-alloc.h
CHANGED
@@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;
 
 GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
 GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
 
 GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -16,13 +16,14 @@ extern "C" {
 typedef void * ggml_backend_buffer_type_context_t;
 
 struct ggml_backend_buffer_type_i {
-
-
-    size_t (*
-
+    const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+    ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+    size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+    size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+    bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
     // check if tensor data is in host memory
     // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-    bool (*is_host) (ggml_backend_buffer_type_t buft);
+    bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
 };
 
 struct ggml_backend_buffer_type {
@@ -34,16 +35,15 @@ extern "C" {
 typedef void * ggml_backend_buffer_context_t;
 
 struct ggml_backend_buffer_i {
-
-
-    void *
-    void
-    void
-    void
-
-    void
-    void
-    void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
+    const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
+    void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+    void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
+    void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+    bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+    void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
+    void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
 };
 
 struct ggml_backend_buffer {
@@ -51,14 +51,17 @@ extern "C" {
     ggml_backend_buffer_type_t buft;
     ggml_backend_buffer_context_t context;
     size_t size;
+    enum ggml_backend_buffer_usage usage;
 };
 
-ggml_backend_buffer_t ggml_backend_buffer_init(
+GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
     ggml_backend_buffer_type_t buft,
     struct ggml_backend_buffer_i iface,
     ggml_backend_buffer_context_t context,
     size_t size);
 
+// do not use directly, use ggml_backend_tensor_copy instead
+bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
 //
 // Backend
@@ -67,33 +70,31 @@ extern "C" {
 typedef void * ggml_backend_context_t;
 
 struct ggml_backend_i {
-    const char * (*get_name)(ggml_backend_t backend);
+    const char * (*GGML_CALL get_name)(ggml_backend_t backend);
 
-    void (*free)(ggml_backend_t backend);
+    void (*GGML_CALL free)(ggml_backend_t backend);
 
     // buffer allocation
-    ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+    ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
 
-    // (optional)
-    void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+    // (optional) asynchronous tensor data access
+    void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+    bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
-    // (optional)
-    void (*
-    void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    void (*synchronize)(ggml_backend_t backend);
+    // (optional) complete all pending operations
+    void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
     // compute graph with a plan
-    ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+    void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-    // compute graph without a plan
-    bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    // compute graph without a plan (async)
+    bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
     // check if the backend supports an operation
-    bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+    bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 };
 
 struct ggml_backend {
@@ -102,14 +103,13 @@ extern "C" {
     ggml_backend_context_t context;
 };
 
-
 //
 // Backend registry
 //
 
-typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
 
-void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
 
 #ifdef __cplusplus
 }