llama_cpp 0.14.4 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +11 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +7 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +155 -155
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +878 -216
- data/vendor/tmp/llama.cpp/ggml.c +8 -8
- data/vendor/tmp/llama.cpp/ggml.h +7 -7
- data/vendor/tmp/llama.cpp/llama.cpp +686 -124
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
+  data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
+  data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
data/CHANGELOG.md
CHANGED
@@ -1,10 +1,18 @@
+## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
+
+- Bump llama.cpp from b2608 to b2658.
+- Add magic number constants.
+- Add `token_cls` and `token_sep` methods to `Model`.
+
+Implementation bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file has been skipped.
+
 ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
 
 - Bump llama.cpp from b2496 to b2573.
 - Add file type constants.
 - Bump llama.cpp from b2573 to b2608.
 
-Implementation
+Implementation bindings for llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
 
 ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
 
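The `token_cls` and `token_sep` methods added in this release map to llama.cpp's `llama_token_cls` and `llama_token_sep` and return the classification and separator token ids as Integers (see the updated `data/sig/llama_cpp.rbs` below). A minimal usage sketch, assuming a locally available GGUF model and the `Model`/`ModelParams` constructor style shown in the gem's README:

```ruby
require 'llama_cpp'

# Hypothetical model path; any GGUF model readable by the bundled llama.cpp (b2658) works.
model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)

# New in 0.14.5: Integer token ids for BERT-style CLS/SEP special tokens.
# Models that do not define them typically report a negative id.
puts "CLS token id: #{model.token_cls}"
puts "SEP token id: #{model.token_sep}"
```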
data/examples/chat.rb
CHANGED
@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
       end
 
       if input_echo
-        output = []
-        embd.each { |token| output << context.model.token_to_piece(token) }
+        output = embd.map { |token| context.model.token_to_piece(token) }
         output_str = output.join
         output_str.chomp!(antiprompt) if first_input
         print(output_str)
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
       if embd_input.size <= n_consumed
         if antiprompt.size.positive?
-          last_output = []
-          last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
+          last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
           last_output_str = last_output.join
 
           search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
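The two `chat.rb` hunks above are behavior-preserving refactors: a pre-declared accumulator filled via `each`/`<<` becomes a single `map`, which returns the same array. A self-contained sketch of the equivalence, using a stand-in lambda instead of `context.model.token_to_piece`:

```ruby
tokens = [3, 1, 4, 1, 5]
to_piece = ->(t) { "tok#{t}" } # stand-in for context.model.token_to_piece

# Before: accumulate into an explicit array.
output = []
tokens.each { |t| output << to_piece.call(t) }

# After: map builds the transformed array directly.
output_mapped = tokens.map { |t| to_piece.call(t) }

raise 'refactor changed behavior' unless output == output_mapped
puts output_mapped.join
```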
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
     abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
   end
   FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
+  FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
 end
 
 abort('libstdc++ is not found.') unless have_library('stdc++')
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1478,6 +1478,8 @@ public:
     rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
     rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
     rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
+    rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
+    rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
     rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
     rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
     rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
@@ -1743,6 +1745,16 @@ private:
     return INT2NUM(llama_token_eos(ptr->model));
   }
 
+  static VALUE _llama_model_token_cls(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_token_cls(ptr->model));
+  }
+
+  static VALUE _llama_model_token_sep(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_token_sep(ptr->model));
+  }
+
   static VALUE _llama_model_token_nl(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_token_nl(ptr->model));
@@ -3414,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
+
   ss_magic.str("");
   ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
+
   ss_magic.str("");
   ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
   rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
 }
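`Init_llama_cpp` formats each magic number with `std::showbase << std::hex` and registers it via `rb_str_new2`, so the new `LLAMA_FILE_MAGIC_GGSQ`, `LLAMA_STATE_SEQ_MAGIC`, and `LLAMA_STATE_SEQ_VERSION` constants arrive in Ruby as Strings (hex strings for the magics, a decimal string for the version), as the updated RBS signatures later in this diff also declare. A small sketch of inspecting them; the example value in the comment is what llama.cpp b2658 is expected to define and is not taken from this diff:

```ruby
require 'llama_cpp'

# All three constants are Strings, not Integers.
puts LLaMACpp::LLAMA_FILE_MAGIC_GGSQ   # e.g. "0x67677371" ('ggsq')
puts LLaMACpp::LLAMA_STATE_SEQ_MAGIC   # magic for sequence state files
puts LLaMACpp::LLAMA_STATE_SEQ_VERSION # decimal string from std::to_string

# Convert a magic to an Integer when a numeric comparison is needed.
magic = Integer(LLaMACpp::LLAMA_FILE_MAGIC_GGSQ, 16)
```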
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.4'
+  VERSION = '0.14.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2608'
+  LLAMA_CPP_VERSION = 'b2658'
 end
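`VERSION` tracks the gem itself and `LLAMA_CPP_VERSION` records the bundled llama.cpp build tag; both are plain Strings. A minimal sketch of reading them at runtime, for example to log which llama.cpp build a deployment is using:

```ruby
require 'llama_cpp'

puts "llama_cpp.rb #{LLaMACpp::VERSION} (bundled llama.cpp #{LLaMACpp::LLAMA_CPP_VERSION})"

# The build tag has the form 'bNNNN'; strip the prefix for a numeric comparison.
build = LLaMACpp::LLAMA_CPP_VERSION.delete_prefix('b').to_i
puts 'bundled llama.cpp predates b2658' if build < 2658
```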
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,14 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
+  LLAMA_FILE_MAGIC_GGLA: String
+  LLAMA_FILE_MAGIC_GGSN: String
+  LLAMA_FILE_MAGIC_GGSQ: String
+  LLAMA_SESSION_MAGIC: String
+  LLAMA_SESSION_VERSION: String
+  LLAMA_STATE_SEQ_MAGIC: String
+  LLAMA_STATE_SEQ_VERSION: String
+
   LLAMA_VOCAB_TYPE_NONE: Integer
   LLAMA_VOCAB_TYPE_SPM: Integer
   LLAMA_VOCAB_TYPE_BPE: Integer
@@ -124,6 +132,8 @@ module LLaMACpp
     def type: (Integer) -> Integer
     def token_bos: () -> Integer
     def token_eos: () -> Integer
+    def token_cls: () -> Integer
+    def token_sep: () -> Integer
     def token_nl: () -> Integer
     def add_bos_token?: () -> bool
     def add_eos_token?: () -> bool
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
@@ -10,7 +10,7 @@ TEST_TARGETS = \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
 	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-	tests/test-json-schema-to-grammar
+	tests/test-json-schema-to-grammar tests/test-grammar-integration
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -648,7 +648,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -805,6 +805,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -923,6 +927,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -137,7 +137,7 @@ extern "C" {
     /*
       Example usage:
 
-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
         // preferrably to run on the same backend as the buffer
         ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
data/vendor/tmp/llama.cpp/ggml-cuda.cu
CHANGED
@@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
-
+    int64_t ldc = id == ctx.device ? ne0 : row_diff;
 
     const int compute_capability = ggml_cuda_info().devices[id].cc;
 
@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
-    const
-    const
+    const int64_t nb2 = dst->nb[2];
+    const int64_t nb3 = dst->nb[3];
 
     GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
@@ -2617,6 +2617,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
        return false;
    }
 
+#if CUDART_VERSION >= 11100
    cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
    if (err != cudaSuccess) {
        // clear the error
@@ -2627,6 +2628,9 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
        return false;
    }
    return true;
+#else
+    return false;
+#endif
 }
 
 GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {