llama_cpp 0.12.0 → 0.12.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +78 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +11 -0
- data/vendor/tmp/llama.cpp/Makefile +7 -10
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +512 -261
- data/vendor/tmp/llama.cpp/ggml-backend.h +43 -33
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1494 -559
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1868 -2002
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +2182 -44
- data/vendor/tmp/llama.cpp/ggml-quants.h +36 -1
- data/vendor/tmp/llama.cpp/ggml.c +222 -105
- data/vendor/tmp/llama.cpp/ggml.h +56 -35
- data/vendor/tmp/llama.cpp/llama.cpp +1271 -1618
- data/vendor/tmp/llama.cpp/llama.h +44 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a65b477c93060832783d03b065dd336820bf22e985dd7b9f53a20e5834f29a0d
|
4
|
+
data.tar.gz: 3ab3f5147bb207ddeea4b902e86de41398fbe497bb521ab00a4fe89ccd790d50
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 119a77a344ece09afda87d89321f679b9c53975c6b340150e298fa3869a0bf48849fafd49e5ef18b001311aae10e3fa9aba29c96de2c4aa8535cdad7d01382cb
|
7
|
+
data.tar.gz: 444fc224413ee6fc94b0866da07460e9c95162941fcd80c831c6f7a950373503eba74b10d437724db2c9debec4719c5a9b25875f1b0a014c956bcb424ca8bf47
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,17 @@
|
|
1
|
+
## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
|
2
|
+
|
3
|
+
- Bump bundled llama.cpp from b1833 to b1892.
|
4
|
+
- Change `LLAMA_SESSION_VERSION` value from 3 to 4.
|
5
|
+
- Add constants for split mode: `LLAMA_SPLIT_NONE`, `LLAMA_SPLIT_LAYER`, and `LLAMA_SPLIT_ROW`
|
6
|
+
- Add `split_mode` accessor to ModelParams.
|
7
|
+
- Add `sample_apply_guidance` method to Context.
|
8
|
+
|
9
|
+
## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
|
10
|
+
|
11
|
+
- Bump bundled llama.cpp from b1768 to b1833.
|
12
|
+
- Add model file type constants.
|
13
|
+
- Add `kv_cache_seq_div` method to `Context`.
|
14
|
+
|
1
15
|
## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
|
2
16
|
|
3
17
|
- Add `get_one` singleton method to `Batch`.
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -789,6 +789,8 @@ public:
|
|
789
789
|
rb_define_alloc_func(rb_cLLaMAModelParams, llama_model_params_alloc);
|
790
790
|
rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers=", RUBY_METHOD_FUNC(_llama_model_params_set_n_gpu_layers), 1);
|
791
791
|
rb_define_method(rb_cLLaMAModelParams, "n_gpu_layers", RUBY_METHOD_FUNC(_llama_model_params_get_n_gpu_layers), 0);
|
792
|
+
rb_define_method(rb_cLLaMAModelParams, "split_mode=", RUBY_METHOD_FUNC(_llama_model_params_set_split_mode), 1);
|
793
|
+
rb_define_method(rb_cLLaMAModelParams, "split_mode", RUBY_METHOD_FUNC(_llama_model_params_get_split_mode), 0);
|
792
794
|
rb_define_method(rb_cLLaMAModelParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_model_params_set_main_gpu), 1);
|
793
795
|
rb_define_method(rb_cLLaMAModelParams, "main_gpu", RUBY_METHOD_FUNC(_llama_model_params_get_main_gpu), 0);
|
794
796
|
rb_define_method(rb_cLLaMAModelParams, "tensor_split", RUBY_METHOD_FUNC(_llama_model_params_get_tensor_split), 0);
|
@@ -815,6 +817,18 @@ private:
|
|
815
817
|
return INT2NUM(ptr->params.n_gpu_layers);
|
816
818
|
}
|
817
819
|
|
820
|
+
// split_mode
|
821
|
+
static VALUE _llama_model_params_set_split_mode(VALUE self, VALUE split_mode) {
|
822
|
+
LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
|
823
|
+
ptr->params.split_mode = static_cast<enum llama_split_mode>(NUM2INT(split_mode));
|
824
|
+
return INT2NUM(ptr->params.split_mode);
|
825
|
+
}
|
826
|
+
|
827
|
+
static VALUE _llama_model_params_get_split_mode(VALUE self) {
|
828
|
+
LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
|
829
|
+
return INT2NUM(ptr->params.split_mode);
|
830
|
+
}
|
831
|
+
|
818
832
|
// main_gpu
|
819
833
|
static VALUE _llama_model_params_set_main_gpu(VALUE self, VALUE main_gpu) {
|
820
834
|
LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
|
@@ -2026,10 +2040,12 @@ public:
|
|
2026
2040
|
rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
|
2027
2041
|
rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
|
2028
2042
|
rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
|
2043
|
+
rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
|
2029
2044
|
rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
|
2030
2045
|
rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
|
2031
2046
|
rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
|
2032
2047
|
rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
|
2048
|
+
rb_define_method(rb_cLLaMAContext, "sample_apply_guidance", RUBY_METHOD_FUNC(_llama_context_sample_apply_guidance), -1);
|
2033
2049
|
rb_define_method(rb_cLLaMAContext, "sample_classifier_free_guidance", RUBY_METHOD_FUNC(_llama_context_sample_classifier_free_guidance), -1);
|
2034
2050
|
rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
|
2035
2051
|
rb_define_method(rb_cLLaMAContext, "sample_top_k", RUBY_METHOD_FUNC(_llama_context_sample_top_k), -1);
|
@@ -2378,6 +2394,16 @@ private:
|
|
2378
2394
|
return Qnil;
|
2379
2395
|
}
|
2380
2396
|
|
2397
|
+
static VALUE _llama_context_kv_cache_seq_div(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE d) {
|
2398
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2399
|
+
if (ptr->ctx == NULL) {
|
2400
|
+
rb_raise(rb_eArgError, "LLaMA context is not initialized");
|
2401
|
+
return Qnil;
|
2402
|
+
}
|
2403
|
+
llama_kv_cache_seq_div(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(d));
|
2404
|
+
return Qnil;
|
2405
|
+
}
|
2406
|
+
|
2381
2407
|
static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
|
2382
2408
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2383
2409
|
if (ptr->ctx == NULL) {
|
@@ -2542,6 +2568,51 @@ private:
|
|
2542
2568
|
return Qnil;
|
2543
2569
|
}
|
2544
2570
|
|
2571
|
+
static VALUE _llama_context_sample_apply_guidance(int argc, VALUE* argv, VALUE self) {
|
2572
|
+
VALUE kw_args = Qnil;
|
2573
|
+
ID kw_table[3] = { rb_intern("logits"), rb_intern("logits_guidance"), rb_intern("scale") };
|
2574
|
+
VALUE kw_values[3] = { Qundef, Qundef, Qundef };
|
2575
|
+
rb_scan_args(argc, argv, ":", &kw_args);
|
2576
|
+
rb_get_kwargs(kw_args, kw_table, 0, 3, kw_values);
|
2577
|
+
|
2578
|
+
if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
|
2579
|
+
rb_raise(rb_eArgError, "logits must be an Array");
|
2580
|
+
return Qnil;
|
2581
|
+
}
|
2582
|
+
if (!RB_TYPE_P(kw_values[1], T_ARRAY)) {
|
2583
|
+
rb_raise(rb_eArgError, "logits_guidance must be an Array");
|
2584
|
+
return Qnil;
|
2585
|
+
}
|
2586
|
+
if (!RB_FLOAT_TYPE_P(kw_values[2])) {
|
2587
|
+
rb_raise(rb_eArgError, "scale must be a float");
|
2588
|
+
return Qnil;
|
2589
|
+
}
|
2590
|
+
|
2591
|
+
const size_t sz_logits = RARRAY_LEN(kw_values[0]);
|
2592
|
+
std::vector<float> logits(sz_logits);
|
2593
|
+
for (size_t i = 0; i < sz_logits; i++) {
|
2594
|
+
logits[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
|
2595
|
+
}
|
2596
|
+
|
2597
|
+
const size_t sz_logits_guidance = RARRAY_LEN(kw_values[1]);
|
2598
|
+
std::vector<float> logits_guidance(sz_logits_guidance);
|
2599
|
+
for (size_t i = 0; i < sz_logits_guidance; i++) {
|
2600
|
+
logits_guidance[i] = NUM2DBL(rb_ary_entry(kw_values[1], i));
|
2601
|
+
}
|
2602
|
+
|
2603
|
+
const float scale = NUM2DBL(kw_values[2]);
|
2604
|
+
|
2605
|
+
LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
|
2606
|
+
if (ctx_ptr->ctx == NULL) {
|
2607
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2608
|
+
return Qnil;
|
2609
|
+
}
|
2610
|
+
|
2611
|
+
llama_sample_apply_guidance(ctx_ptr->ctx, logits.data(), logits_guidance.data(), scale);
|
2612
|
+
|
2613
|
+
return Qnil;
|
2614
|
+
}
|
2615
|
+
|
2545
2616
|
static VALUE _llama_context_sample_classifier_free_guidance(int argc, VALUE* argv, VALUE self) {
|
2546
2617
|
VALUE kw_args = Qnil;
|
2547
2618
|
ID kw_table[2] = { rb_intern("guidance"), rb_intern("scale") };
|
@@ -3209,6 +3280,9 @@ extern "C" void Init_llama_cpp(void) {
|
|
3209
3280
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
|
3210
3281
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
|
3211
3282
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
|
3283
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
|
3284
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
|
3285
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
|
3212
3286
|
|
3213
3287
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
|
3214
3288
|
|
@@ -3230,6 +3304,10 @@ extern "C" void Init_llama_cpp(void) {
|
|
3230
3304
|
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
|
3231
3305
|
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
|
3232
3306
|
|
3307
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
|
3308
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
|
3309
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
|
3310
|
+
|
3233
3311
|
std::stringstream ss_magic;
|
3234
3312
|
ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
|
3235
3313
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.12.0'
|
6
|
+
VERSION = '0.12.2'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b1768'
|
9
|
+
LLAMA_CPP_VERSION = 'b1892'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -22,6 +22,9 @@ module LLaMACpp
|
|
22
22
|
LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
|
23
23
|
LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
|
24
24
|
LLAMA_FTYPE_MOSTLY_Q6_K: Integer
|
25
|
+
LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
|
26
|
+
LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
|
27
|
+
LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
|
25
28
|
|
26
29
|
LLAMA_KV_OVERRIDE_INT: Integer
|
27
30
|
LLAMA_KV_OVERRIDE_FLOAT: Integer
|
@@ -41,6 +44,10 @@ module LLaMACpp
|
|
41
44
|
LLAMA_ROPE_SCALING_YARN: Integer
|
42
45
|
LLAMA_ROPE_SCALING_MAX_VALUE: Integer
|
43
46
|
|
47
|
+
LLAMA_SPLIT_NONE: Integer
|
48
|
+
LLAMA_SPLIT_LAYER: Integer
|
49
|
+
LLAMA_SPLIT_ROW: Integer
|
50
|
+
|
44
51
|
def self?.backend_init: (?numa: bool) -> void
|
45
52
|
def self?.backend_free: () -> void
|
46
53
|
def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
|
@@ -135,6 +142,8 @@ module LLaMACpp
|
|
135
142
|
|
136
143
|
def n_gpu_layers: () -> Integer
|
137
144
|
def n_gpu_layers=: (Integer) -> Integer
|
145
|
+
def split_mode: () -> Integer
|
146
|
+
def split_mode=: (Integer) -> Integer
|
138
147
|
def main_gpu: () -> Integer
|
139
148
|
def main_gpu=: (Integer) -> Integer
|
140
149
|
def tensor_split: () -> Array[Float]
|
@@ -193,10 +202,12 @@ module LLaMACpp
|
|
193
202
|
def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
|
194
203
|
def kv_cache_seq_keep: (Integer) -> void
|
195
204
|
def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
|
205
|
+
def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
|
196
206
|
def set_rng_seed: (Integer) -> void
|
197
207
|
def load_session_file: (session_path: String) -> void
|
198
208
|
def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
|
199
209
|
def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
|
210
|
+
def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
|
200
211
|
def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
|
201
212
|
def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
|
202
213
|
def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
|
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# Define the default target now so that it is always the first target
|
2
2
|
BUILD_TARGETS = \
|
3
|
-
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
3
|
+
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
4
4
|
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
5
|
-
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
|
5
|
+
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
|
6
6
|
|
7
7
|
# Binaries only useful for tests
|
8
8
|
TEST_TARGETS = \
|
@@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
|
|
43
43
|
endif
|
44
44
|
endif
|
45
45
|
|
46
|
-
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
|
47
|
-
BUILD_TARGETS += metal
|
48
|
-
endif
|
49
|
-
|
50
46
|
default: $(BUILD_TARGETS)
|
51
47
|
|
52
48
|
test: $(TEST_TARGETS)
|
@@ -620,6 +616,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
|
|
620
616
|
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
621
617
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
622
618
|
|
619
|
+
imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
620
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
621
|
+
|
623
622
|
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
624
623
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
625
624
|
|
@@ -671,10 +670,8 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
|
|
671
670
|
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
672
671
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
673
672
|
|
674
|
-
|
675
|
-
|
676
|
-
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
677
|
-
endif
|
673
|
+
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
674
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
678
675
|
|
679
676
|
ifeq ($(UNAME_S),Darwin)
|
680
677
|
swift: examples/batched.swift
|
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
|
102
102
|
}
|
103
103
|
}
|
104
104
|
|
105
|
-
AT_PRINTF("block %d\n", best_fit_block);
|
106
|
-
|
107
105
|
if (best_fit_block == -1) {
|
108
106
|
// the last block is our last resort
|
109
107
|
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
|
@@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
|
117
115
|
return;
|
118
116
|
}
|
119
117
|
}
|
118
|
+
|
120
119
|
struct free_block * block = &alloc->free_blocks[best_fit_block];
|
121
120
|
void * addr = block->addr;
|
122
121
|
block->addr = (char*)block->addr + size;
|
@@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
|
129
128
|
}
|
130
129
|
}
|
131
130
|
|
131
|
+
AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
|
132
|
+
|
132
133
|
tensor->data = addr;
|
133
134
|
tensor->buffer = alloc->buffer;
|
134
135
|
if (!alloc->measure) {
|
@@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
|
|
229
230
|
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
230
231
|
} else {
|
231
232
|
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
|
233
|
+
ggml_backend_buffer_reset(alloc->buffer);
|
232
234
|
}
|
233
235
|
}
|
234
236
|
|
@@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
|
|
263
265
|
return alloc;
|
264
266
|
}
|
265
267
|
|
266
|
-
ggml_tallocr_t
|
268
|
+
ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
|
267
269
|
// create a backend buffer to get the correct tensor allocation sizes
|
268
|
-
ggml_backend_buffer_t buffer =
|
270
|
+
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
|
269
271
|
|
270
272
|
// TODO: move alloc initialization to a common ggml_tallocr_new_impl function
|
271
273
|
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
|
@@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
|
|
275
277
|
return alloc;
|
276
278
|
}
|
277
279
|
|
278
|
-
ggml_tallocr_t
|
279
|
-
|
280
|
+
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
|
281
|
+
return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
|
282
|
+
}
|
283
|
+
|
284
|
+
ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
|
285
|
+
// create a backend buffer to get the correct tensor allocation sizes
|
286
|
+
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
280
287
|
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
|
281
288
|
alloc->buffer_owned = true;
|
282
289
|
return alloc;
|
283
290
|
}
|
284
291
|
|
292
|
+
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
|
293
|
+
return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
|
294
|
+
}
|
295
|
+
|
285
296
|
ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
|
286
297
|
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
|
287
298
|
|
@@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|
779
790
|
|
780
791
|
if (nbytes == 0) {
|
781
792
|
// all the tensors in the context are already allocated
|
793
|
+
#ifndef NDEBUG
|
794
|
+
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
|
795
|
+
#endif
|
782
796
|
return NULL;
|
783
797
|
}
|
784
798
|
|
785
799
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
|
800
|
+
if (buffer == NULL) {
|
801
|
+
// failed to allocate buffer
|
802
|
+
#ifndef NDEBUG
|
803
|
+
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
|
804
|
+
#endif
|
805
|
+
return NULL;
|
806
|
+
}
|
807
|
+
|
786
808
|
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
|
787
809
|
|
788
810
|
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
data/vendor/tmp/llama.cpp/ggml-alloc.h
CHANGED
@@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;
|
|
52
52
|
|
53
53
|
GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
|
54
54
|
GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
|
55
|
-
GGML_API ggml_tallocr_t
|
55
|
+
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
|
56
56
|
GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
|
57
|
+
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
58
|
+
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
|
57
59
|
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
|
58
60
|
|
59
61
|
GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
|
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -16,13 +16,14 @@ extern "C" {
|
|
16
16
|
typedef void * ggml_backend_buffer_type_context_t;
|
17
17
|
|
18
18
|
struct ggml_backend_buffer_type_i {
|
19
|
-
|
20
|
-
|
21
|
-
size_t (*
|
22
|
-
|
19
|
+
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
20
|
+
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
21
|
+
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
22
|
+
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
23
|
+
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
23
24
|
// check if tensor data is in host memory
|
24
25
|
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
25
|
-
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
26
|
+
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
|
26
27
|
};
|
27
28
|
|
28
29
|
struct ggml_backend_buffer_type {
|
@@ -34,16 +35,15 @@ extern "C" {
|
|
34
35
|
typedef void * ggml_backend_buffer_context_t;
|
35
36
|
|
36
37
|
struct ggml_backend_buffer_i {
|
37
|
-
|
38
|
-
|
39
|
-
void *
|
40
|
-
void
|
41
|
-
void
|
42
|
-
void
|
43
|
-
|
44
|
-
void
|
45
|
-
void
|
46
|
-
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
38
|
+
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
|
39
|
+
void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
|
40
|
+
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
|
41
|
+
void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
42
|
+
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
43
|
+
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
44
|
+
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
45
|
+
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
46
|
+
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
47
47
|
};
|
48
48
|
|
49
49
|
struct ggml_backend_buffer {
|
@@ -51,14 +51,17 @@ extern "C" {
|
|
51
51
|
ggml_backend_buffer_type_t buft;
|
52
52
|
ggml_backend_buffer_context_t context;
|
53
53
|
size_t size;
|
54
|
+
enum ggml_backend_buffer_usage usage;
|
54
55
|
};
|
55
56
|
|
56
|
-
ggml_backend_buffer_t ggml_backend_buffer_init(
|
57
|
+
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
57
58
|
ggml_backend_buffer_type_t buft,
|
58
59
|
struct ggml_backend_buffer_i iface,
|
59
60
|
ggml_backend_buffer_context_t context,
|
60
61
|
size_t size);
|
61
62
|
|
63
|
+
// do not use directly, use ggml_backend_tensor_copy instead
|
64
|
+
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
62
65
|
|
63
66
|
//
|
64
67
|
// Backend
|
@@ -67,33 +70,31 @@ extern "C" {
|
|
67
70
|
typedef void * ggml_backend_context_t;
|
68
71
|
|
69
72
|
struct ggml_backend_i {
|
70
|
-
const char * (*get_name)(ggml_backend_t backend);
|
73
|
+
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
|
71
74
|
|
72
|
-
void (*free)(ggml_backend_t backend);
|
75
|
+
void (*GGML_CALL free)(ggml_backend_t backend);
|
73
76
|
|
74
77
|
// buffer allocation
|
75
|
-
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
|
78
|
+
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
|
76
79
|
|
77
|
-
// (optional)
|
78
|
-
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
79
|
-
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
80
|
+
// (optional) asynchronous tensor data access
|
81
|
+
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
82
|
+
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
83
|
+
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
80
84
|
|
81
|
-
// (optional)
|
82
|
-
void (*
|
83
|
-
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
84
|
-
|
85
|
-
void (*synchronize)(ggml_backend_t backend);
|
85
|
+
// (optional) complete all pending operations
|
86
|
+
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
86
87
|
|
87
88
|
// compute graph with a plan
|
88
|
-
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
89
|
-
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
90
|
-
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
89
|
+
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
90
|
+
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
91
|
+
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
91
92
|
|
92
|
-
// compute graph without a plan
|
93
|
-
|
93
|
+
// compute graph without a plan (async)
|
94
|
+
bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
94
95
|
|
95
96
|
// check if the backend supports an operation
|
96
|
-
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
97
|
+
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
97
98
|
};
|
98
99
|
|
99
100
|
struct ggml_backend {
|
@@ -102,14 +103,13 @@ extern "C" {
|
|
102
103
|
ggml_backend_context_t context;
|
103
104
|
};
|
104
105
|
|
105
|
-
|
106
106
|
//
|
107
107
|
// Backend registry
|
108
108
|
//
|
109
109
|
|
110
|
-
typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
|
110
|
+
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
|
111
111
|
|
112
|
-
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
112
|
+
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
113
113
|
|
114
114
|
#ifdef __cplusplus
|
115
115
|
}
|