llama_cpp 0.9.5 → 0.10.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1140 -355
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +506 -158
- data/ext/llama_cpp/src/ggml-metal.metal +795 -144
- data/ext/llama_cpp/src/ggml.c +331 -111
- data/ext/llama_cpp/src/ggml.h +49 -4
- data/ext/llama_cpp/src/llama.cpp +749 -329
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7f406c15621a7c247adaacf1d588ddf278225e6846466afd1184c00f1ee61768
+  data.tar.gz: df73657c75a80cb44f41d34a3c1054676cf59a5d7d56cb1c2ce8a94264002293
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: acd08d5099f14bf2bd4c8f9bf016253f0e316179b79d72fbe7066b0d645ca31e9bab427fcc53d93874f8df74cb1746731e2cd21864bfecdecff91f9778919b42
+  data.tar.gz: 5014a1bd545be90c56bebd48119a198cf7276513cb6c5f00d8322aa6eaa9a27442bc51bf06953a11c2fc04145f797c630cefee17b36589fe38f9226003416a09
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
+## [[0.10.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.5...v0.10.0)] - 2023-12-09
+
+- Bump bundled llama.cpp from b1593 to b1620.
+- Add `ModelKVOverride` class.
+- Add `offload_kqv`, `type_k`, and `type_v` to ContextParams.
+- Add KV override type constants.
+
+**Breaking Changes**
+- Remove `f16_kv` from ContextParams.
+
 ## [[0.9.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.4...v0.9.5)] - 2023-12-02
 
 - Bump bundled llama.cpp from b1555 to b1593.
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -3,6 +3,7 @@
 VALUE rb_mLLaMACpp;
 VALUE rb_cLLaMABatch;
 VALUE rb_cLLaMAModel;
+VALUE rb_cLLaMAModelKVOverride;
 VALUE rb_cLLaMAModelParams;
 VALUE rb_cLLaMATimings;
 VALUE rb_cLLaMAContext;
@@ -612,6 +613,78 @@ const rb_data_type_t RbLLaMATimings::llama_timings_type = {
   RUBY_TYPED_FREE_IMMEDIATELY
 };
 
+class RbLLaMAModelKVOverride {
+public:
+  static VALUE llama_model_kv_override_alloc(VALUE self) {
+    llama_model_kv_override* ptr = (llama_model_kv_override*)ruby_xmalloc(sizeof(llama_model_kv_override));
+    new (ptr) llama_model_kv_override();
+    return TypedData_Wrap_Struct(self, &llama_model_kv_override_type, ptr);
+  }
+
+  static void llama_model_kv_override_free(void* ptr) {
+    ((llama_model_kv_override*)ptr)->~llama_model_kv_override();
+    ruby_xfree(ptr);
+  }
+
+  static size_t llama_model_kv_override_size(const void* ptr) {
+    return sizeof(*((llama_model_kv_override*)ptr));
+  }
+
+  static llama_model_kv_override* get_llama_model_kv_override(VALUE self) {
+    llama_model_kv_override* ptr;
+    TypedData_Get_Struct(self, llama_model_kv_override, &llama_model_kv_override_type, ptr);
+    return ptr;
+  }
+
+  static void define_class(VALUE outer) {
+    rb_cLLaMAModelKVOverride = rb_define_class_under(outer, "ModelKVOverride", rb_cObject);
+    rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
+    rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "int_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_int_value), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "float_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_float_value), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "bool_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_bool_value), 0);
+  }
+
+  static const rb_data_type_t llama_model_kv_override_type;
+
+private:
+  static VALUE _llama_model_kv_override_get_key(VALUE self) {
+    llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+    return rb_utf8_str_new_cstr(ptr->key);
+  }
+
+  static VALUE _llama_model_kv_override_get_tag(VALUE self) {
+    llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+    return INT2NUM(ptr->tag);
+  }
+
+  static VALUE _llama_model_kv_override_get_int_value(VALUE self) {
+    llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+    return INT2NUM(ptr->int_value);
+  }
+
+  static VALUE _llama_model_kv_override_get_float_value(VALUE self) {
+    llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+    return DBL2NUM(ptr->float_value);
+  }
+
+  static VALUE _llama_model_kv_override_get_bool_value(VALUE self) {
+    llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+    return ptr->bool_value ? Qtrue : Qfalse;
+  }
+};
+
+const rb_data_type_t RbLLaMAModelKVOverride::llama_model_kv_override_type = {
+  "RbLLaMAModelKVOverride",
+  { NULL,
+    RbLLaMAModelKVOverride::llama_model_kv_override_free,
+    RbLLaMAModelKVOverride::llama_model_kv_override_size },
+  NULL,
+  NULL,
+  RUBY_TYPED_FREE_IMMEDIATELY
+};
+
 class LLaMAModelParamsWrapper {
 public:
   struct llama_model_params params;
@@ -812,14 +885,18 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "yarn_beta_slow", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_beta_slow), 0);
     rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_yarn_orig_ctx), 1);
     rb_define_method(rb_cLLaMAContextParams, "yarn_orig_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_yarn_orig_ctx), 0);
+    rb_define_method(rb_cLLaMAContextParams, "type_k=", RUBY_METHOD_FUNC(_llama_context_params_set_type_k), 1);
+    rb_define_method(rb_cLLaMAContextParams, "type_k", RUBY_METHOD_FUNC(_llama_context_params_get_type_k), 0);
+    rb_define_method(rb_cLLaMAContextParams, "type_v=", RUBY_METHOD_FUNC(_llama_context_params_set_type_v), 1);
+    rb_define_method(rb_cLLaMAContextParams, "type_v", RUBY_METHOD_FUNC(_llama_context_params_get_type_v), 0);
     rb_define_method(rb_cLLaMAContextParams, "mul_mat_q=", RUBY_METHOD_FUNC(_llama_context_params_set_mul_mat_q), 1);
     rb_define_method(rb_cLLaMAContextParams, "mul_mat_q", RUBY_METHOD_FUNC(_llama_context_params_get_mul_mat_q), 0);
-    rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
-    rb_define_method(rb_cLLaMAContextParams, "f16_kv", RUBY_METHOD_FUNC(_llama_context_params_get_f16_kv), 0);
     rb_define_method(rb_cLLaMAContextParams, "logits_all=", RUBY_METHOD_FUNC(_llama_context_params_set_logits_all), 1);
     rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
     rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
     rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
+    rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
+    rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
   }
 
 private:
@@ -991,28 +1068,40 @@ private:
     return UINT2NUM(ptr->params.yarn_orig_ctx);
   }
 
-  // mul_mat_q
-  static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
+  // type_k
+  static VALUE _llama_context_params_set_type_k(VALUE self, VALUE type_k) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.mul_mat_q = RTEST(mul_mat_q) ? true : false;
-    return ptr->params.mul_mat_q ? Qtrue : Qfalse;
+    ptr->params.type_k = static_cast<enum ggml_type>(NUM2INT(type_k));
+    return INT2NUM(ptr->params.type_k);
   }
 
-  static VALUE _llama_context_params_get_mul_mat_q(VALUE self) {
+  static VALUE _llama_context_params_get_type_k(VALUE self) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    return ptr->params.mul_mat_q ? Qtrue : Qfalse;
+    return INT2NUM(ptr->params.type_k);
   }
 
-  // f16_kv
-  static VALUE _llama_context_params_set_f16_kv(VALUE self, VALUE f16_kv) {
+  // type_v
+  static VALUE _llama_context_params_set_type_v(VALUE self, VALUE type_v) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.f16_kv = RTEST(f16_kv) ? true : false;
-    return ptr->params.f16_kv ? Qtrue : Qfalse;
+    ptr->params.type_v = static_cast<enum ggml_type>(NUM2INT(type_v));
+    return INT2NUM(ptr->params.type_v);
   }
 
-  static VALUE _llama_context_params_get_f16_kv(VALUE self) {
+  static VALUE _llama_context_params_get_type_v(VALUE self) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    return ptr->params.f16_kv ? Qtrue : Qfalse;
+    return INT2NUM(ptr->params.type_v);
+  }
+
+  // mul_mat_q
+  static VALUE _llama_context_params_set_mul_mat_q(VALUE self, VALUE mul_mat_q) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.mul_mat_q = RTEST(mul_mat_q) ? true : false;
+    return ptr->params.mul_mat_q ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_context_params_get_mul_mat_q(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.mul_mat_q ? Qtrue : Qfalse;
   }
 
   // logits_all
@@ -1038,6 +1127,18 @@ private:
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return ptr->params.embedding ? Qtrue : Qfalse;
   }
+
+  // offload_kqv
+  static VALUE _llama_context_params_set_offload_kqv(VALUE self, VALUE offload_kqv) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.offload_kqv = RTEST(offload_kqv) ? true : false;
+    return ptr->params.offload_kqv ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_context_params_get_offload_kqv(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.offload_kqv ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -2352,7 +2453,7 @@ private:
     const float penalty_present = NUM2DBL(kw_values[2]);
 
     llama_sample_repetition_penalties(ctx_ptr->ctx, &(cnd_ptr->array), last_n_tokens_data.data(), last_tokens_size,
-
+                                      penalty_repeat, penalty_freq, penalty_present);
 
     return Qnil;
   }
@@ -2973,6 +3074,7 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMATokenData::define_class(rb_mLLaMACpp);
   RbLLaMATokenDataArray::define_class(rb_mLLaMACpp);
   RbLLaMAModel::define_class(rb_mLLaMACpp);
+  RbLLaMAModelKVOverride::define_class(rb_mLLaMACpp);
   RbLLaMAModelParams::define_class(rb_mLLaMACpp);
   RbLLaMATimings::define_class(rb_mLLaMACpp);
   RbLLaMAContext::define_class(rb_mLLaMACpp);
@@ -3023,6 +3125,10 @@ extern "C" void Init_llama_cpp(void) {
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_INT", INT2NUM(LLAMA_KV_OVERRIDE_INT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_FLOAT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_BOOL));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_RULE_REF", INT2NUM(LLAMA_GRETYPE_RULE_REF));
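The bindings above are thin wrappers over the bundled llama.cpp structs: ModelKVOverride wraps llama_model_kv_override, and the new ContextParams accessors write straight into llama_context_params (which is why the type_k/type_v setters cast through enum ggml_type). As a rough orientation only, a minimal C++ sketch of the b1620 C API that these setters populate; the model path and the metadata key are placeholders, field layout follows the bundled llama.h, and error handling is omitted:

// Sketch (not part of the gem): the b1620 llama.cpp calls behind
// ContextParams#type_k=, #type_v=, #offload_kqv= and ModelKVOverride.
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init(false);

    // Override one GGUF metadata key at load time; the loader stops at the
    // first entry whose key is empty, hence the zero-initialized sentinel.
    // "some.metadata.key" is a placeholder, not a real key.
    llama_model_kv_override overrides[2] = {};
    snprintf(overrides[0].key, sizeof(overrides[0].key), "%s", "some.metadata.key");
    overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
    overrides[0].int_value = 8;

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides;
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.type_k      = GGML_TYPE_Q8_0; // quantized K cache (replaces the removed f16_kv flag)
    cparams.type_v      = GGML_TYPE_F16;  // V cache element type
    cparams.offload_kqv = true;           // run KQV ops, including the KV cache, on the GPU
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
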
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED

@@ -168,10 +168,6 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
 
-    if (!alloc->measure) {
-        ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
-    }
-
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
 #endif
@@ -237,7 +233,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
 }
 
 ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
 
     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
@@ -449,7 +445,6 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
 static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
     ggml_tallocr_t alloc = node_tallocr(galloc, view);
 
-    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
     GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
     if (update_backend) {
         view->backend = view->view_src->backend;
@@ -459,7 +454,7 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
 
     // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
     // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->
+    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
     if (!alloc->measure) {
         ggml_backend_buffer_init_tensor(alloc->buffer, view);
@@ -765,3 +760,43 @@ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
 size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
     return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 }
+
+// utils
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+
+    size_t nbytes = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL && t->view_src == NULL) {
+            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+    }
+
+    if (nbytes == 0) {
+        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                ggml_tallocr_alloc(tallocr, t);
+            } else {
+                ggml_backend_view_init(buffer, t);
+            }
+        }
+    }
+
+    ggml_tallocr_free(tallocr);
+
+    return buffer;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
+    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
+}
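The two helpers added above size a single backend buffer for every unallocated, non-view tensor in a ggml_context and then place each tensor in it, which replaces hand-rolled measure-and-allocate loops for the common case. A minimal C++ sketch against the CPU backend; tensor shapes and the metadata pool size are illustrative, and error handling is omitted:

// Sketch: allocate all tensors of a no_alloc context in one backend buffer.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main() {
    // The context holds only tensor metadata; data lives in the backend buffer.
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 16,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // asserted by ggml_backend_alloc_ctx_tensors_from_buft
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    (void)a; (void)b;

    ggml_backend_t backend = ggml_backend_cpu_init();
    // Sizes both tensors (padded to the buffer type's alignment),
    // allocates one buffer, and assigns each tensor its slice.
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // ... build and compute graphs over a and b here ...

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}
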
data/ext/llama_cpp/src/ggml-alloc.h
CHANGED

@@ -8,6 +8,7 @@ extern "C" {
 
 struct ggml_backend;
 struct ggml_backend_buffer;
+struct ggml_backend_buffer_type;
 
 //
 // Legacy API

@@ -80,6 +81,12 @@ GGML_API void ggml_gallocr_alloc_graph_n(
         struct ggml_hash_set hash_set,
         ggml_tallocr_t * hash_node_talloc);
 
+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+
 #ifdef __cplusplus
 }
 #endif
data/ext/llama_cpp/src/ggml-backend-impl.h
CHANGED

@@ -12,31 +12,50 @@ extern "C" {
    // Backend buffer
    //
 
+    // buffer type
+    typedef void * ggml_backend_buffer_type_context_t;
+
+    struct ggml_backend_buffer_type_i {
+        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+    };
+
+    struct ggml_backend_buffer_type {
+        struct ggml_backend_buffer_type_i  iface;
+        ggml_backend_buffer_type_context_t context;
+    };
+
+    // buffer
    typedef void * ggml_backend_buffer_context_t;
 
    struct ggml_backend_buffer_i {
-        void
-        void
-
-        void
-        void
+        void   (*free_buffer)(ggml_backend_buffer_t buffer);
+        //void (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        void * (*get_base)   (ggml_backend_buffer_t buffer);
+        void   (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void   (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void   (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        // (optional) copy tensor between different buffer-type, allows for single-copy transfers
+        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
    };
 
    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i
-
-        ggml_backend_t backend;
+        struct ggml_backend_buffer_i  iface;
+        ggml_backend_buffer_type_t    buft;
        ggml_backend_buffer_context_t context;
-
        size_t size;
    };
 
-
-
+    ggml_backend_buffer_t ggml_backend_buffer_init(
+                   ggml_backend_buffer_type_t    buft,
            struct ggml_backend_buffer_i          iface,
                   ggml_backend_buffer_context_t  context,
                   size_t                         size);
+
    //
    // Backend
    //
@@ -49,20 +68,17 @@ extern "C" {
        void (*free)(ggml_backend_t backend);
 
        // buffer allocation
-
+        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
 
-        //
-        size_t (*get_alignment)(ggml_backend_t backend);
-
-        // tensor data access
-        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+        // (optional) asynchronous tensor data access
        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        void (*synchronize) (ggml_backend_t backend);
 
-        // (optional)
-        void (*
-        void (*
+        // (optional) asynchronous tensor copy
+        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        void (*synchronize) (ggml_backend_t backend);
 
        // compute graph with a plan
        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -82,6 +98,15 @@ extern "C" {
        ggml_backend_context_t context;
    };
 
+
+    //
+    // Backend registry
+    //
+
+    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+
+    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif
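The reworked internal header splits the description of a kind of memory (ggml_backend_buffer_type) from a concrete allocation (ggml_backend_buffer). A rough C++ sketch of what a custom host-memory buffer type could look like against this interface; it is hypothetical and deliberately incomplete (no set_tensor/get_tensor, no init_tensor), and the real CPU implementation in ggml-backend.c differs:

// Sketch of a custom host-memory buffer type (hypothetical, incomplete).
#include <cstdlib>
#include "ggml.h"
#include "ggml-backend-impl.h"

static void   host_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); }
static void * host_get_base   (ggml_backend_buffer_t buffer) { return buffer->context; }

static struct ggml_backend_buffer_i host_buffer_iface = {
    /* .free_buffer     = */ host_free_buffer,
    /* .get_base        = */ host_get_base,
    /* .init_tensor     = */ NULL, // a complete implementation would also
    /* .set_tensor      = */ NULL, // provide tensor data access here
    /* .get_tensor      = */ NULL,
    /* .cpy_tensor_from = */ NULL,
    /* .cpy_tensor_to   = */ NULL,
};

static ggml_backend_buffer_t host_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    // The raw allocation becomes the buffer's context; get_base returns it.
    return ggml_backend_buffer_init(buft, host_buffer_iface, malloc(size), size);
}

static size_t host_get_alignment(ggml_backend_buffer_type_t buft) {
    return 64; // arbitrary alignment for this sketch
}

static size_t host_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
    return ggml_nbytes(tensor); // no per-tensor padding in this sketch
}

static bool host_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    return true; // assume any backend can read host memory here
}

static struct ggml_backend_buffer_type host_buffer_type = {
    /* .iface   = */ { host_alloc_buffer, host_get_alignment, host_get_alloc_size, host_supports_backend },
    /* .context = */ NULL,
};

Splitting the interface this way is what lets ggml_backend_alloc_ctx_tensors_from_buft (see ggml-alloc.c above) allocate for a buffer type without ever holding a backend instance.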