llama_cpp 0.14.4 → 0.14.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +29 -9
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +142 -49
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +156 -156
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +942 -267
- data/vendor/tmp/llama.cpp/ggml.c +161 -95
- data/vendor/tmp/llama.cpp/ggml.h +12 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1577 -274
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- data/vendor/tmp/llama.cpp/sgemm.cpp +1148 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5c4bd6bcb93b98a00f94dcdf93d04f853174f73e281d96fce8f837a6ba7f250e
|
4
|
+
data.tar.gz: 6d184e9ce927c06ba794bea63a09007a175a72e477366ffb1c5763ceb2c7c71e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 953fe2777a759e5467694b8afb9d3f929a42603e81b2c3e38ba0fda4bb6dca78b2d147345023f99c2c9fb899cc746bf6729ad2726c2cb473d7094e93c13caf73
|
7
|
+
data.tar.gz: 71eb3cd5a5c619e9cc8a3418be745a8b76dc5e8cabe5b26a766230a8533df9a11c3981601b0be4ec0adb34a49f86ad741503ffc9f3b0d7ba021a7e9ddc3246a7
|
data/CHANGELOG.md
CHANGED
@@ -1,10 +1,22 @@
|
|
1
|
+
## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2658 to b2698.
|
4
|
+
|
5
|
+
## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
|
6
|
+
|
7
|
+
- Bump llama.cpp from b2608 to b2658.
|
8
|
+
- Add magic number constants.
|
9
|
+
- Add `token_cls` and `token_sep` methods to `Model`.
|
10
|
+
|
11
|
+
Implementation of bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file has been skipped.
|
12
|
+
|
1
13
|
## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
|
2
14
|
|
3
15
|
- Bump llama.cpp from b2496 to b2573.
|
4
16
|
- Add file type constants.
|
5
17
|
- Bump llama.cpp from b2573 to b2608.
|
6
18
|
|
7
|
-
Implementation
|
19
|
+
Implementation of bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 has been skipped.
|
8
20
|
|
9
21
|
## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
|
10
22
|
|
data/examples/chat.rb
CHANGED
@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
|
|
127
127
|
end
|
128
128
|
|
129
129
|
if input_echo
|
130
|
-
output =
|
131
|
-
embd.each { |token| output << context.model.token_to_piece(token) }
|
130
|
+
output = embd.map { |token| context.model.token_to_piece(token) }
|
132
131
|
output_str = output.join
|
133
132
|
output_str.chomp!(antiprompt) if first_input
|
134
133
|
print(output_str)
|
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
|
|
136
135
|
|
137
136
|
if embd_input.size <= n_consumed
|
138
137
|
if antiprompt.size.positive?
|
139
|
-
last_output =
|
140
|
-
last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
|
138
|
+
last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
|
141
139
|
last_output_str = last_output.join
|
142
140
|
|
143
141
|
search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
|
|
37
37
|
abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
|
38
38
|
end
|
39
39
|
FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
|
40
|
+
FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
|
40
41
|
end
|
41
42
|
|
42
43
|
abort('libstdc++ is not found.') unless have_library('stdc++')
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1478,6 +1478,8 @@ public:
|
|
1478
1478
|
rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
|
1479
1479
|
rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
|
1480
1480
|
rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
|
1481
|
+
rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
|
1482
|
+
rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
|
1481
1483
|
rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
|
1482
1484
|
rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
|
1483
1485
|
rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
|
@@ -1743,6 +1745,16 @@ private:
|
|
1743
1745
|
return INT2NUM(llama_token_eos(ptr->model));
|
1744
1746
|
}
|
1745
1747
|
|
1748
|
+
static VALUE _llama_model_token_cls(VALUE self) {
|
1749
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1750
|
+
return INT2NUM(llama_token_cls(ptr->model));
|
1751
|
+
}
|
1752
|
+
|
1753
|
+
static VALUE _llama_model_token_sep(VALUE self) {
|
1754
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1755
|
+
return INT2NUM(llama_token_sep(ptr->model));
|
1756
|
+
}
|
1757
|
+
|
1746
1758
|
static VALUE _llama_model_token_nl(VALUE self) {
|
1747
1759
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1748
1760
|
return INT2NUM(llama_token_nl(ptr->model));
|
@@ -3414,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
|
|
3414
3426
|
ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
|
3415
3427
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
|
3416
3428
|
|
3429
|
+
ss_magic.str("");
|
3430
|
+
ss_magic.clear(std::stringstream::goodbit);
|
3431
|
+
ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
|
3432
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
|
3433
|
+
|
3417
3434
|
ss_magic.str("");
|
3418
3435
|
ss_magic.clear(std::stringstream::goodbit);
|
3419
3436
|
ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
|
3420
3437
|
rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
|
3421
3438
|
|
3439
|
+
ss_magic.str("");
|
3440
|
+
ss_magic.clear(std::stringstream::goodbit);
|
3441
|
+
ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
|
3442
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
|
3443
|
+
|
3422
3444
|
ss_magic.str("");
|
3423
3445
|
ss_magic.clear(std::stringstream::goodbit);
|
3424
3446
|
ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
|
3425
3447
|
rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
|
3426
3448
|
|
3427
3449
|
rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
|
3450
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
|
3428
3451
|
}
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.14.
|
6
|
+
VERSION = '0.14.6'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = '
|
9
|
+
LLAMA_CPP_VERSION = 'b2698'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,14 @@ module LLaMACpp
|
|
3
3
|
LLAMA_CPP_VERSION: String
|
4
4
|
LLAMA_DEFALUT_SEED: String
|
5
5
|
|
6
|
+
LLAMA_FILE_MAGIC_GGLA: String
|
7
|
+
LLAMA_FILE_MAGIC_GGSN: String
|
8
|
+
LLAMA_FILE_MAGIC_GGSQ: String
|
9
|
+
LLAMA_SESSION_MAGIC: String
|
10
|
+
LLAMA_SESSION_VERSION: String
|
11
|
+
LLAMA_STATE_SEQ_MAGIC: String
|
12
|
+
LLAMA_STATE_SEQ_VERSION: String
|
13
|
+
|
6
14
|
LLAMA_VOCAB_TYPE_NONE: Integer
|
7
15
|
LLAMA_VOCAB_TYPE_SPM: Integer
|
8
16
|
LLAMA_VOCAB_TYPE_BPE: Integer
|
@@ -124,6 +132,8 @@ module LLaMACpp
|
|
124
132
|
def type: (Integer) -> Integer
|
125
133
|
def token_bos: () -> Integer
|
126
134
|
def token_eos: () -> Integer
|
135
|
+
def token_cls: () -> Integer
|
136
|
+
def token_sep: () -> Integer
|
127
137
|
def token_nl: () -> Integer
|
128
138
|
def add_bos_token?: () -> bool
|
129
139
|
def add_eos_token?: () -> bool
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Define the default target now so that it is always the first target
|
2
2
|
BUILD_TARGETS = \
|
3
3
|
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
4
|
-
simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
|
4
|
+
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
|
5
5
|
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
6
6
|
|
7
7
|
# Binaries only useful for tests
|
@@ -10,7 +10,7 @@ TEST_TARGETS = \
|
|
10
10
|
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
11
11
|
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
|
12
12
|
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
|
13
|
-
tests/test-json-schema-to-grammar
|
13
|
+
tests/test-json-schema-to-grammar tests/test-grammar-integration
|
14
14
|
|
15
15
|
# Code coverage output files
|
16
16
|
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
@@ -386,6 +386,15 @@ ifdef LLAMA_OPENBLAS
|
|
386
386
|
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
387
387
|
endif # LLAMA_OPENBLAS
|
388
388
|
|
389
|
+
# TODO: temporarily disabled until MoE is fixed
|
390
|
+
# https://github.com/ggerganov/llama.cpp/pull/6716
|
391
|
+
LLAMA_NO_LLAMAFILE := 1
|
392
|
+
|
393
|
+
ifndef LLAMA_NO_LLAMAFILE
|
394
|
+
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
395
|
+
OBJS += sgemm.o
|
396
|
+
endif
|
397
|
+
|
389
398
|
ifdef LLAMA_BLIS
|
390
399
|
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
391
400
|
MK_LDFLAGS += -lblis -L/usr/local/lib
|
@@ -482,11 +491,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
|
|
482
491
|
|
483
492
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
484
493
|
$(NVCC_COMPILE)
|
485
|
-
|
486
494
|
endif # LLAMA_CUDA
|
487
495
|
|
488
496
|
ifdef LLAMA_CLBLAST
|
489
|
-
|
490
497
|
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
|
491
498
|
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
492
499
|
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
@@ -605,6 +612,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
|
605
612
|
$(CC) $(CFLAGS) -c $< -o $@
|
606
613
|
endif # LLAMA_MPI
|
607
614
|
|
615
|
+
ifndef LLAMA_NO_LLAMAFILE
|
616
|
+
sgemm.o: sgemm.cpp sgemm.h ggml.h
|
617
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
618
|
+
endif
|
619
|
+
|
608
620
|
GF_CC := $(CC)
|
609
621
|
include scripts/get-flags.mk
|
610
622
|
|
@@ -648,7 +660,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
|
|
648
660
|
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
649
661
|
ifndef CUDA_DOCKER_ARCH
|
650
662
|
ifndef CUDA_POWER_ARCH
|
651
|
-
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
|
663
|
+
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
|
652
664
|
endif # CUDA_POWER_ARCH
|
653
665
|
endif # CUDA_DOCKER_ARCH
|
654
666
|
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
@@ -690,7 +702,7 @@ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml
|
|
690
702
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
691
703
|
|
692
704
|
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
693
|
-
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
|
705
|
+
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
694
706
|
|
695
707
|
common.o: common/common.cpp $(COMMON_H_DEPS)
|
696
708
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
@@ -724,7 +736,7 @@ lib: llama.o ggml.o $(OBJS)
|
|
724
736
|
ar rcs libllama.a $^
|
725
737
|
|
726
738
|
clean:
|
727
|
-
rm -vrf *.o tests/*.o *.so *.a *.dll
|
739
|
+
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
728
740
|
rm -vrf ggml-cuda/*.o
|
729
741
|
|
730
742
|
#
|
@@ -761,7 +773,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
|
|
761
773
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
762
774
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
763
775
|
|
764
|
-
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o
|
776
|
+
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
765
777
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
766
778
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
767
779
|
|
@@ -793,7 +805,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
|
793
805
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
794
806
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
795
807
|
|
796
|
-
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp
|
808
|
+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
797
809
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
798
810
|
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
799
811
|
|
@@ -805,6 +817,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
|
|
805
817
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
806
818
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
807
819
|
|
820
|
+
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
821
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
822
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
823
|
+
|
808
824
|
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
809
825
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
810
826
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -923,6 +939,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
|
|
923
939
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
924
940
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
925
941
|
|
942
|
+
tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
|
943
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
944
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
945
|
+
|
926
946
|
tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
|
927
947
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
928
948
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -137,7 +137,7 @@ extern "C" {
|
|
137
137
|
/*
|
138
138
|
Example usage:
|
139
139
|
|
140
|
-
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be
|
140
|
+
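// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// NOTE(review): the continuation of this sentence misspells "preferably" as "preferrably" — confirm and fix upstream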
|
141
141
|
// preferably to run on the same backend as the buffer
|
142
142
|
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
143
143
|
|
@@ -1225,13 +1225,13 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|
1225
1225
|
|
1226
1226
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
1227
1227
|
// ldc == nrows of the matrix that cuBLAS writes into
|
1228
|
-
|
1228
|
+
int64_t ldc = id == ctx.device ? ne0 : row_diff;
|
1229
1229
|
|
1230
1230
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
1231
1231
|
|
1232
1232
|
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
1233
1233
|
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
1234
|
-
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
|
1234
|
+
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
|
1235
1235
|
if (src0->type != GGML_TYPE_F16) {
|
1236
1236
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
1237
1237
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
@@ -1241,7 +1241,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|
1241
1241
|
}
|
1242
1242
|
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
|
1243
1243
|
|
1244
|
-
ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
|
1244
|
+
ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
|
1245
1245
|
if (src1->type != GGML_TYPE_F16) {
|
1246
1246
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
1247
1247
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
@@ -1250,7 +1250,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|
1250
1250
|
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
|
1251
1251
|
}
|
1252
1252
|
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
|
1253
|
-
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
|
1253
|
+
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
|
1254
1254
|
|
1255
1255
|
const half alpha_f16 = 1.0f;
|
1256
1256
|
const half beta_f16 = 0.0f;
|
@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
|
|
1377
1377
|
const int64_t ne0 = dst->ne[0];
|
1378
1378
|
const int64_t ne1 = dst->ne[1];
|
1379
1379
|
|
1380
|
-
const
|
1381
|
-
const
|
1380
|
+
const int64_t nb2 = dst->nb[2];
|
1381
|
+
const int64_t nb3 = dst->nb[3];
|
1382
1382
|
|
1383
1383
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
|
1384
1384
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
|
@@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
1946
1946
|
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
1947
1947
|
// KQV single-batch
|
1948
1948
|
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
|
1949
|
-
} else if (!split &&
|
1949
|
+
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
1950
1950
|
// KQ + KQV multi-batch
|
1951
1951
|
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
1952
1952
|
} else if (use_dequantize_mul_mat_vec) {
|
@@ -1960,20 +1960,73 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
1960
1960
|
}
|
1961
1961
|
}
|
1962
1962
|
|
1963
|
+
struct mmid_row_mapping {
|
1964
|
+
int32_t i1;
|
1965
|
+
int32_t i2;
|
1966
|
+
};
|
1967
|
+
|
1968
|
+
static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
|
1969
|
+
int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
|
1970
|
+
const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
|
1971
|
+
int64_t ne11, int64_t ne10,
|
1972
|
+
size_t nb11, size_t nb12) {
|
1973
|
+
int32_t iid1 = blockIdx.x;
|
1974
|
+
int32_t id = blockIdx.y;
|
1975
|
+
|
1976
|
+
const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
|
1977
|
+
|
1978
|
+
if (row_id_i != i02) {
|
1979
|
+
return;
|
1980
|
+
}
|
1981
|
+
|
1982
|
+
const int64_t i11 = id % ne11;
|
1983
|
+
const int64_t i12 = iid1;
|
1984
|
+
|
1985
|
+
__shared__ int src1_row;
|
1986
|
+
if (threadIdx.x == 0) {
|
1987
|
+
src1_row = atomicAdd(cur_src1_row, 1);
|
1988
|
+
row_mapping[src1_row] = {id, iid1};
|
1989
|
+
}
|
1990
|
+
__syncthreads();
|
1991
|
+
|
1992
|
+
const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
|
1993
|
+
float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
|
1994
|
+
|
1995
|
+
for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
|
1996
|
+
src1_row_contiguous[i] = src1_row_original[i];
|
1997
|
+
}
|
1998
|
+
}
|
1999
|
+
|
2000
|
+
static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
|
2001
|
+
const mmid_row_mapping * __restrict__ row_mapping,
|
2002
|
+
int64_t ne0,
|
2003
|
+
size_t nb1, size_t nb2) {
|
2004
|
+
int32_t i = blockIdx.x;
|
2005
|
+
|
2006
|
+
const int32_t i1 = row_mapping[i].i1;
|
2007
|
+
const int32_t i2 = row_mapping[i].i2;
|
2008
|
+
|
2009
|
+
const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
|
2010
|
+
float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
|
2011
|
+
|
2012
|
+
for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
|
2013
|
+
dst_row_original[j] = dst_row_contiguous[j];
|
2014
|
+
}
|
2015
|
+
}
|
2016
|
+
|
1963
2017
|
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
1964
2018
|
const ggml_tensor * src0 = dst->src[0];
|
1965
2019
|
const ggml_tensor * src1 = dst->src[1];
|
1966
2020
|
const ggml_tensor * ids = dst->src[2];
|
1967
2021
|
|
2022
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
2023
|
+
|
1968
2024
|
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
|
1969
2025
|
|
1970
2026
|
cudaStream_t stream = ctx.stream();
|
1971
2027
|
|
1972
|
-
const
|
1973
|
-
const
|
1974
|
-
|
1975
|
-
const int32_t id = ((int32_t *) dst->op_params)[0];
|
1976
|
-
const int32_t n_as = src0->ne[2];
|
2028
|
+
const int64_t n_as = ne02;
|
2029
|
+
const int64_t n_ids = ids->ne[0];
|
1977
2030
|
|
1978
2031
|
std::vector<char> ids_host(ggml_nbytes(ids));
|
1979
2032
|
const char * ids_dev = (const char *) ids->data;
|
@@ -1982,7 +2035,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|
1982
2035
|
|
1983
2036
|
ggml_tensor src0_row = *src0;
|
1984
2037
|
ggml_tensor src1_row = *src1;
|
1985
|
-
ggml_tensor dst_row
|
2038
|
+
ggml_tensor dst_row = *dst;
|
1986
2039
|
|
1987
2040
|
char * src0_original = (char *) src0->data;
|
1988
2041
|
char * src1_original = (char *) src1->data;
|
@@ -1990,19 +2043,39 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|
1990
2043
|
|
1991
2044
|
src0_row.ne[2] = 1;
|
1992
2045
|
src0_row.ne[3] = 1;
|
1993
|
-
src0_row.nb[3] =
|
2046
|
+
src0_row.nb[3] = nb02;
|
1994
2047
|
|
1995
|
-
|
1996
|
-
|
1997
|
-
|
2048
|
+
src1_row.ne[1] = 1;
|
2049
|
+
src1_row.ne[2] = 1;
|
2050
|
+
src1_row.ne[3] = 1;
|
2051
|
+
src1_row.nb[2] = nb11;
|
2052
|
+
src1_row.nb[3] = nb11;
|
1998
2053
|
|
1999
|
-
|
2054
|
+
dst_row.ne[1] = 1;
|
2055
|
+
dst_row.ne[2] = 1;
|
2056
|
+
dst_row.ne[3] = 1;
|
2057
|
+
dst_row.nb[2] = nb1;
|
2058
|
+
dst_row.nb[3] = nb1;
|
2000
2059
|
|
2001
|
-
|
2002
|
-
|
2003
|
-
|
2060
|
+
if (ne12 == 1) {
|
2061
|
+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
2062
|
+
for (int64_t id = 0; id < n_ids; id++) {
|
2063
|
+
const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
2004
2064
|
|
2005
|
-
|
2065
|
+
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
2066
|
+
|
2067
|
+
const int64_t i11 = id % ne11;
|
2068
|
+
const int64_t i12 = iid1;
|
2069
|
+
|
2070
|
+
const int64_t i1 = id;
|
2071
|
+
const int64_t i2 = i12;
|
2072
|
+
|
2073
|
+
src0_row.data = src0_original + i02*nb02;
|
2074
|
+
src1_row.data = src1_original + i11*nb11 + i12*nb12;
|
2075
|
+
dst_row.data = dst_original + i1*nb1 + i2*nb2;
|
2076
|
+
|
2077
|
+
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
2078
|
+
}
|
2006
2079
|
}
|
2007
2080
|
} else {
|
2008
2081
|
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
|
@@ -2011,54 +2084,69 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|
2011
2084
|
src1_row.data = src1_contiguous.get();
|
2012
2085
|
dst_row.data = dst_contiguous.get();
|
2013
2086
|
|
2014
|
-
for (
|
2087
|
+
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
2015
2088
|
int64_t num_src1_rows = 0;
|
2016
|
-
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
2017
|
-
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
2018
2089
|
|
2019
|
-
|
2020
|
-
|
2021
|
-
|
2090
|
+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
2091
|
+
for (int64_t id = 0; id < n_ids; id++) {
|
2092
|
+
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
2093
|
+
|
2094
|
+
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
2022
2095
|
|
2023
|
-
|
2096
|
+
if (row_id_i != i02) {
|
2097
|
+
continue;
|
2098
|
+
}
|
2024
2099
|
|
2025
|
-
|
2026
|
-
|
2027
|
-
num_src1_rows++;
|
2100
|
+
num_src1_rows++;
|
2101
|
+
}
|
2028
2102
|
}
|
2029
2103
|
|
2030
2104
|
if (num_src1_rows == 0) {
|
2031
2105
|
continue;
|
2032
2106
|
}
|
2033
2107
|
|
2034
|
-
|
2108
|
+
ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
|
2109
|
+
ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
|
2110
|
+
CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
|
2035
2111
|
|
2036
|
-
|
2037
|
-
|
2112
|
+
{
|
2113
|
+
dim3 block_dims(std::min((unsigned int)ne10, 768u));
|
2114
|
+
dim3 grid_dims(ids->ne[1], n_ids);
|
2115
|
+
k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
2116
|
+
src1_original, src1_contiguous.get(),
|
2117
|
+
dev_cur_src1_row.get(), dev_row_mapping.get(),
|
2118
|
+
ids_dev, i02, ids->nb[1], ids->nb[0],
|
2119
|
+
ne11, ne10,
|
2120
|
+
nb11, nb12);
|
2121
|
+
CUDA_CHECK(cudaGetLastError());
|
2122
|
+
}
|
2038
2123
|
|
2124
|
+
src0_row.data = src0_original + i02*nb02;
|
2125
|
+
|
2126
|
+
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
2127
|
+
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
2128
|
+
|
2129
|
+
src1_row.ne[1] = num_src1_rows;
|
2039
2130
|
src1_row.nb[1] = nb11;
|
2040
2131
|
src1_row.nb[2] = num_src1_rows*nb11;
|
2041
2132
|
src1_row.nb[3] = num_src1_rows*nb11;
|
2042
2133
|
|
2134
|
+
dst_row.ne[1] = num_src1_rows;
|
2043
2135
|
dst_row.nb[1] = nb1;
|
2044
2136
|
dst_row.nb[2] = num_src1_rows*nb1;
|
2045
2137
|
dst_row.nb[3] = num_src1_rows*nb1;
|
2046
2138
|
|
2047
2139
|
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
2048
2140
|
|
2049
|
-
|
2050
|
-
|
2051
|
-
|
2052
|
-
|
2053
|
-
|
2054
|
-
|
2055
|
-
|
2056
|
-
|
2057
|
-
|
2058
|
-
|
2059
|
-
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
|
2060
|
-
nb1, cudaMemcpyDeviceToDevice, stream));
|
2061
|
-
num_src1_rows++;
|
2141
|
+
{
|
2142
|
+
dim3 block_dims(std::min((unsigned int)ne0, 768u));
|
2143
|
+
dim3 grid_dims(num_src1_rows);
|
2144
|
+
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
2145
|
+
dst_original, dst_contiguous.get(),
|
2146
|
+
dev_row_mapping.get(),
|
2147
|
+
ne0,
|
2148
|
+
nb1, nb2);
|
2149
|
+
CUDA_CHECK(cudaGetLastError());
|
2062
2150
|
}
|
2063
2151
|
}
|
2064
2152
|
}
|
@@ -2487,7 +2575,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
2487
2575
|
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
2488
2576
|
const int min_batch_size = 32;
|
2489
2577
|
|
2490
|
-
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS
|
2578
|
+
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
2579
|
+
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
2491
2580
|
|
2492
2581
|
GGML_UNUSED(backend);
|
2493
2582
|
}
|
@@ -2617,6 +2706,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
|
|
2617
2706
|
return false;
|
2618
2707
|
}
|
2619
2708
|
|
2709
|
+
#if CUDART_VERSION >= 11100
|
2620
2710
|
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
|
2621
2711
|
if (err != cudaSuccess) {
|
2622
2712
|
// clear the error
|
@@ -2627,6 +2717,9 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
|
|
2627
2717
|
return false;
|
2628
2718
|
}
|
2629
2719
|
return true;
|
2720
|
+
#else
|
2721
|
+
return false;
|
2722
|
+
#endif
|
2630
2723
|
}
|
2631
2724
|
|
2632
2725
|
GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
@@ -88,7 +88,7 @@ typedef uint16_t ggml_fp16_internal_t;
|
|
88
88
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
89
89
|
#include <intrin.h>
|
90
90
|
#else
|
91
|
-
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
91
|
+
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
92
92
|
#if !defined(__riscv)
|
93
93
|
#include <immintrin.h>
|
94
94
|
#endif
|