llama_cpp 0.14.2 → 0.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +20 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +154 -124
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8741 -8691
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +260 -28
- data/vendor/tmp/llama.cpp/ggml-quants.c +25 -13
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +237 -78
- data/vendor/tmp/llama.cpp/ggml-sycl.h +6 -1
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml.c +98 -16
- data/vendor/tmp/llama.cpp/llama.cpp +382 -42
- data/vendor/tmp/llama.cpp/llama.h +19 -4
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3b503998061ee4c8a87bc3148d41feda0b45b04cbe0cafdb3897d1d457b26e0a
+  data.tar.gz: b761a18fd964ca0a4e871d01cc0a6058527c951413de7b110a8b07862ed64d8c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2951b2a59b0579f3afa983283a73853300f822891f0d1dfef292727d6f313392ddc68902144caaca33b173e43e95076dda02ffa97228cf7f65babc4ac82354c9
+  data.tar.gz: cb655d32282b28ebaee30b87b882600c79a6666c306de2692a059da3de1bc21d3c988116fd1dd26d97c00ea0f22fdccc8b3f8d94b20cb01c819d9a578c71bd67
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
+## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
+
+- Bump llama.cpp from b2435 to b2496.
+- Add `n_layer` method to `Model`.
+- Add `apply_control_vector` method to `Context`.
+
 ## [[0.14.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.1...v0.14.2)] - 2024-03-16
 
 - Fix to use metal embed library on macOS.
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1466,6 +1466,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
+    rb_define_method(rb_cLLaMAModel, "n_layer", RUBY_METHOD_FUNC(_llama_model_get_model_n_layer), 0);
     rb_define_method(rb_cLLaMAModel, "rope_freq_scale_train", RUBY_METHOD_FUNC(_llama_model_rope_freq_scale_train), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize), -1);
@@ -1613,6 +1614,11 @@ private:
     return INT2NUM(llama_n_embd(ptr->model));
   }
 
+  static VALUE _llama_model_get_model_n_layer(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_layer(ptr->model));
+  }
+
   static VALUE _llama_model_rope_freq_scale_train(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return DBL2NUM(llama_rope_freq_scale_train(ptr->model));
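The binding forwards straight to llama_n_layer, so the layer count becomes readable alongside the other model dimensions. A minimal Ruby sketch of the new accessor, assuming the usual `LLaMACpp::Model` construction from the gem's README; the GGUF path is a hypothetical placeholder:

```ruby
require 'llama_cpp'

# Hypothetical local model file; any GGUF model works the same way.
model = LLaMACpp::Model.new(
  model_path: 'models/llama-2-7b.Q4_K_M.gguf',
  params: LLaMACpp::ModelParams.new
)

puts "n_vocab: #{model.n_vocab}"
puts "n_embd:  #{model.n_embd}"
puts "n_layer: #{model.n_layer}" # added in 0.14.3
```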
@@ -2083,6 +2089,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_token", RUBY_METHOD_FUNC(_llama_context_sample_token), 1);
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
   }
 
 private:
@@ -3153,6 +3160,59 @@ private:
 
     return Qnil;
   }
+
+  static VALUE _llama_context_apply_control_vector(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("data"), rb_intern("n_embd"), rb_intern("il_start"), rb_intern("il_end") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY) && !NIL_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "data must be an Array or nil");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_embd must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "il_start must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "il_end must be an Integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    std::vector<float> data(RARRAY_LEN(kw_values[0]));
+    for (size_t i = 0; i < data.size(); i++) {
+      data[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+    }
+    const int32_t n_embd = NUM2INT(kw_values[1]);
+    const int32_t il_start = NUM2INT(kw_values[2]);
+    const int32_t il_end = NUM2INT(kw_values[3]);
+
+    int32_t err = 0;
+    if (NIL_P(kw_values[0])) {
+      err = llama_control_vector_apply(ctx_ptr->ctx, NULL, 0, n_embd, il_start, il_end);
+    } else {
+      err = llama_control_vector_apply(ctx_ptr->ctx, data.data(), data.size(), n_embd, il_start, il_end);
+    }
+
+    if (err) {
+      rb_raise(rb_eRuntimeError, "Failed to apply control vector");
+      return Qnil;
+    }
+
+    return Qnil;
+  }
 };
 
 const rb_data_type_t RbLLaMAContext::llama_context_type = {
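From Ruby, the new `Context#apply_control_vector` takes the flattened control-vector values plus the embedding size and the layer range to steer, mirroring llama.cpp's llama_control_vector_apply. A hedged sketch, reusing `model` from the snippet above and a context built as in the gem's README; the zero-filled array is only a shape placeholder, not a trained control vector:

```ruby
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

n_embd  = model.n_embd
n_layer = model.n_layer

# Flat Float array with n_embd values per layer, zero-filled here so the call
# is effectively a no-op; real values would come from a control-vector/steering
# training tool. The exact layout is defined by llama_control_vector_apply.
data = Array.new(n_embd * n_layer, 0.0)

context.apply_control_vector(data: data, n_embd: n_embd, il_start: 1, il_end: n_layer)
```

The argument check in the binding above also admits `data: nil`, which is forwarded to `llama_control_vector_apply` as a NULL pointer, llama.cpp's way of clearing the currently applied control vector.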
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.2'
+  VERSION = '0.14.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2435'
+  LLAMA_CPP_VERSION = 'b2496'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -108,6 +108,7 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx_train: () -> Integer
     def n_embd: () -> Integer
+    def n_layer: () -> Integer
     def rope_freq_scale_train: () -> Float
     def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
@@ -244,6 +245,7 @@ module LLaMACpp
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
+    def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
   end
 
   class ContextParams
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -9,7 +9,8 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+	tests/test-json-schema-to-grammar
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -555,7 +556,7 @@ endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef LLAMA_METAL_EMBED_LIBRARY
@@ -597,6 +598,11 @@ include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
@@ -663,6 +669,9 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -745,7 +754,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
@@ -753,6 +762,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -861,6 +874,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 
             ggml_gallocr_hash_get(galloc, src)->n_children += 1;
 
-            // allocate explicit inputs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);