llama_cpp 0.14.2 → 0.14.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +20 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +154 -124
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8741 -8691
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +260 -28
- data/vendor/tmp/llama.cpp/ggml-quants.c +25 -13
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +237 -78
- data/vendor/tmp/llama.cpp/ggml-sycl.h +6 -1
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml.c +98 -16
- data/vendor/tmp/llama.cpp/llama.cpp +382 -42
- data/vendor/tmp/llama.cpp/llama.h +19 -4
- metadata +3 -3
checksums.yaml CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3b503998061ee4c8a87bc3148d41feda0b45b04cbe0cafdb3897d1d457b26e0a
+  data.tar.gz: b761a18fd964ca0a4e871d01cc0a6058527c951413de7b110a8b07862ed64d8c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2951b2a59b0579f3afa983283a73853300f822891f0d1dfef292727d6f313392ddc68902144caaca33b173e43e95076dda02ffa97228cf7f65babc4ac82354c9
+  data.tar.gz: cb655d32282b28ebaee30b87b882600c79a6666c306de2692a059da3de1bc21d3c988116fd1dd26d97c00ea0f22fdccc8b3f8d94b20cb01c819d9a578c71bd67
```
data/CHANGELOG.md CHANGED

```diff
@@ -1,3 +1,9 @@
+## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
+
+- Bump llama.cpp from b2435 to b2496.
+- Add `n_layer` method to `Model`.
+- Add `apply_control_vector` method to `Context`.
+
 ## [[0.14.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.1...v0.14.2)] - 2024-03-16
 
 - Fix to use metal embed library on macOS.
```
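The two new APIs are easiest to see from the Ruby side. A minimal, hypothetical sketch of the new `Model#n_layer` accessor, assuming the `Model`/`Context` constructor signatures shown in the gem's README (the model path is a placeholder):

```ruby
require 'llama_cpp'

# Placeholder path: any GGUF model file.
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model)

model.n_embd   # embedding width, as before
model.n_layer  # new in 0.14.3: number of transformer layers
```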
data/ext/llama_cpp/llama_cpp.cpp CHANGED

```diff
@@ -1466,6 +1466,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
+    rb_define_method(rb_cLLaMAModel, "n_layer", RUBY_METHOD_FUNC(_llama_model_get_model_n_layer), 0);
     rb_define_method(rb_cLLaMAModel, "rope_freq_scale_train", RUBY_METHOD_FUNC(_llama_model_rope_freq_scale_train), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize), -1);
@@ -1613,6 +1614,11 @@ private:
     return INT2NUM(llama_n_embd(ptr->model));
   }
 
+  static VALUE _llama_model_get_model_n_layer(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_layer(ptr->model));
+  }
+
   static VALUE _llama_model_rope_freq_scale_train(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return DBL2NUM(llama_rope_freq_scale_train(ptr->model));
@@ -2083,6 +2089,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_token", RUBY_METHOD_FUNC(_llama_context_sample_token), 1);
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
   }
 
 private:
@@ -3153,6 +3160,59 @@ private:
 
     return Qnil;
   }
+
+  static VALUE _llama_context_apply_control_vector(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("data"), rb_intern("n_embd"), rb_intern("il_start"), rb_intern("il_end") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY) && !NIL_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "data must be an Array or nil");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_embd must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "il_start must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "il_end must be an Integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    std::vector<float> data(RARRAY_LEN(kw_values[0]));
+    for (size_t i = 0; i < data.size(); i++) {
+      data[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+    }
+    const int32_t n_embd = NUM2INT(kw_values[1]);
+    const int32_t il_start = NUM2INT(kw_values[2]);
+    const int32_t il_end = NUM2INT(kw_values[3]);
+
+    int32_t err = 0;
+    if (NIL_P(kw_values[0])) {
+      err = llama_control_vector_apply(ctx_ptr->ctx, NULL, 0, n_embd, il_start, il_end);
+    } else {
+      err = llama_control_vector_apply(ctx_ptr->ctx, data.data(), data.size(), n_embd, il_start, il_end);
+    }
+
+    if (err) {
+      rb_raise(rb_eRuntimeError, "Failed to apply control vector");
+      return Qnil;
+    }
+
+    return Qnil;
+  }
 };
 
 const rb_data_type_t RbLLaMAContext::llama_context_type = {
```
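The binding above mirrors `llama_control_vector_apply`: `data` is flattened into a `float` buffer (or forwarded as `NULL` when `nil`), and `il_start`/`il_end` select the layer range. A hedged usage sketch with placeholder values only (a real control vector would come from training, not zeros), reusing the hypothetical `model` and `context` objects from the earlier example:

```ruby
n_embd  = model.n_embd
n_layer = model.n_layer

# One n_embd-sized direction per layer; zeros are placeholders only.
data = Array.new(n_embd * n_layer, 0.0)

context.apply_control_vector(data: data, n_embd: n_embd,
                             il_start: 1, il_end: n_layer)

# Per the NIL_P branch in the binding, data: nil is forwarded as NULL,
# which llama.cpp treats as clearing any applied control vector.
```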
data/lib/llama_cpp/version.rb CHANGED

```diff
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.2'
+  VERSION = '0.14.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2435'
+  LLAMA_CPP_VERSION = 'b2496'
 end
```
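After upgrading, the bump is visible directly in the two constants changed above:

```ruby
require 'llama_cpp'

LLaMACpp::VERSION           # => "0.14.3"
LLaMACpp::LLAMA_CPP_VERSION # => "b2496"
```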
data/sig/llama_cpp.rbs CHANGED

```diff
@@ -108,6 +108,7 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx_train: () -> Integer
     def n_embd: () -> Integer
+    def n_layer: () -> Integer
     def rope_freq_scale_train: () -> Float
     def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
@@ -244,6 +245,7 @@ module LLaMACpp
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
+    def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
   end
 
   class ContextParams
```
data/vendor/tmp/llama.cpp/Makefile CHANGED

```diff
@@ -9,7 +9,8 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+	tests/test-json-schema-to-grammar
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -555,7 +556,7 @@ endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef LLAMA_METAL_EMBED_LIBRARY
@@ -597,6 +598,11 @@ include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
@@ -663,6 +669,9 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -745,7 +754,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
@@ -753,6 +762,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -861,6 +874,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
```
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED

```diff
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 
             ggml_gallocr_hash_get(galloc, src)->n_children += 1;
 
-            // allocate explicit inputs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
```
data/vendor/tmp/llama.cpp/ggml-backend-impl.h CHANGED

```diff
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);
```