llama_cpp 0.14.4 → 0.14.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +11 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +7 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +155 -155
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +878 -216
- data/vendor/tmp/llama.cpp/ggml.c +8 -8
- data/vendor/tmp/llama.cpp/ggml.h +7 -7
- data/vendor/tmp/llama.cpp/llama.cpp +686 -124
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- metadata +2 -2
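To pick up this release in a Bundler project, a Gemfile entry along these lines is enough (the pessimistic version constraint is only an example; pin however you prefer):

  # Gemfile
  gem 'llama_cpp', '~> 0.14.5'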
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
+  data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
+  data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
data/CHANGELOG.md
CHANGED

@@ -1,10 +1,18 @@
+## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
+
+- Bump llama.cpp from b2608 to b2658.
+- Add magic number constants.
+- Add `token_cls` and `token_sep` methods to `Model`.
+
+Implementation bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file has been skipped.
+
 ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
 
 - Bump llama.cpp from b2496 to b2573.
 - Add file type constants.
 - Bump llama.cpp from b2573 to b2608.
 
-Implementation
+Implementation bindings for llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
 
 ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
 
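As a quick orientation for the additions listed in the 0.14.5 entry, here is a minimal Ruby sketch; the model path is a placeholder, and it assumes the gem's existing keyword-argument constructor (model_path:/params:). The returned token ids depend entirely on the loaded vocabulary.

  require 'llama_cpp'

  model_params = LLaMACpp::ModelParams.new
  model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)

  # Existing special-token accessors
  puts model.token_bos
  puts model.token_eos

  # New in 0.14.5: CLS and SEP special-token accessors
  puts model.token_cls
  puts model.token_sep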
data/examples/chat.rb
CHANGED

@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
       end
 
       if input_echo
-        output = []
-        embd.each { |token| output << context.model.token_to_piece(token) }
+        output = embd.map { |token| context.model.token_to_piece(token) }
         output_str = output.join
         output_str.chomp!(antiprompt) if first_input
         print(output_str)
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
       if embd_input.size <= n_consumed
         if antiprompt.size.positive?
-          last_output = []
-          last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
+          last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
           last_output_str = last_output.join
 
           search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
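For reference, the refactor above swaps an accumulator loop for a single map call; both forms produce the same array of decoded pieces (tokens and model are placeholder names in this sketch):

  # Before: build the array by appending inside each
  pieces = []
  tokens.each { |t| pieces << model.token_to_piece(t) }

  # After: map returns the transformed array directly
  pieces = tokens.map { |t| model.token_to_piece(t) }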
data/ext/llama_cpp/extconf.rb
CHANGED

@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
     abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
   end
   FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
+  FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
 end
 
 abort('libstdc++ is not found.') unless have_library('stdc++')
data/ext/llama_cpp/llama_cpp.cpp
CHANGED

@@ -1478,6 +1478,8 @@ public:
     rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
     rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
     rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
+    rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
+    rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
     rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
     rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
     rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
@@ -1743,6 +1745,16 @@ private:
     return INT2NUM(llama_token_eos(ptr->model));
   }
 
+  static VALUE _llama_model_token_cls(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_token_cls(ptr->model));
+  }
+
+  static VALUE _llama_model_token_sep(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_token_sep(ptr->model));
+  }
+
   static VALUE _llama_model_token_nl(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_token_nl(ptr->model));
@@ -3414,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
+
   ss_magic.str("");
   ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
+
   ss_magic.str("");
   ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
   rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
 }
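On the Ruby side, the constants registered above surface on the LLaMACpp module as strings, since they are created with rb_str_new2; a small sketch (the printed values come from whatever the bundled llama.h defines):

  require 'llama_cpp'

  # Magic numbers are exposed as hex strings, versions as decimal strings.
  puts LLaMACpp::LLAMA_FILE_MAGIC_GGSQ
  puts LLaMACpp::LLAMA_STATE_SEQ_MAGIC
  puts LLaMACpp::LLAMA_STATE_SEQ_VERSION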
data/lib/llama_cpp/version.rb
CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.4'
+  VERSION = '0.14.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2608'
+  LLAMA_CPP_VERSION = 'b2658'
 end
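Both constants are readable at runtime, which is handy for logging which llama.cpp build the installed bindings target:

  require 'llama_cpp'

  puts LLaMACpp::VERSION            # "0.14.5"
  puts LLaMACpp::LLAMA_CPP_VERSION  # "b2658"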
data/sig/llama_cpp.rbs
CHANGED

@@ -3,6 +3,14 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
+  LLAMA_FILE_MAGIC_GGLA: String
+  LLAMA_FILE_MAGIC_GGSN: String
+  LLAMA_FILE_MAGIC_GGSQ: String
+  LLAMA_SESSION_MAGIC: String
+  LLAMA_SESSION_VERSION: String
+  LLAMA_STATE_SEQ_MAGIC: String
+  LLAMA_STATE_SEQ_VERSION: String
+
   LLAMA_VOCAB_TYPE_NONE: Integer
   LLAMA_VOCAB_TYPE_SPM: Integer
   LLAMA_VOCAB_TYPE_BPE: Integer
@@ -124,6 +132,8 @@ module LLaMACpp
   def type: (Integer) -> Integer
   def token_bos: () -> Integer
   def token_eos: () -> Integer
+  def token_cls: () -> Integer
+  def token_sep: () -> Integer
   def token_nl: () -> Integer
   def add_bos_token?: () -> bool
   def add_eos_token?: () -> bool
data/vendor/tmp/llama.cpp/Makefile
CHANGED

@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
@@ -10,7 +10,7 @@ TEST_TARGETS = \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
 	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-	tests/test-json-schema-to-grammar
+	tests/test-json-schema-to-grammar tests/test-grammar-integration
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -648,7 +648,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -805,6 +805,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -923,6 +927,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED

@@ -137,7 +137,7 @@ extern "C" {
     /*
       Example usage:
 
-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
         // preferrably to run on the same backend as the buffer
         ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
data/vendor/tmp/llama.cpp/ggml-cuda.cu
CHANGED

@@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
-
+    int64_t ldc = id == ctx.device ? ne0 : row_diff;
 
     const int compute_capability = ggml_cuda_info().devices[id].cc;
 
@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];
 
-    const
-    const
+    const int64_t nb2 = dst->nb[2];
+    const int64_t nb3 = dst->nb[3];
 
     GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
@@ -2617,6 +2617,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         return false;
     }
 
+#if CUDART_VERSION >= 11100
     cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
     if (err != cudaSuccess) {
         // clear the error
@@ -2627,6 +2628,9 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         return false;
     }
     return true;
+#else
+    return false;
+#endif
 }
 
 GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {