llama_cpp 0.14.3 → 0.14.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +27 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +14 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +81 -20
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
- data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +141 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -12
- data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
- data/vendor/tmp/llama.cpp/llama.h +145 -29
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
|
4
|
+
data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
|
7
|
+
data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2608 to b2658.
|
4
|
+
- Add magic number constants.
|
5
|
+
- Add `token_cls` and `token_sep` methods to `Model`.
|
6
|
+
|
7
|
+
Implementation of bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file has been skipped.
|
8
|
+
|
9
|
+
## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
|
10
|
+
|
11
|
+
- Bump llama.cpp from b2496 to b2573.
|
12
|
+
- Add file type constants.
|
13
|
+
- Bump llama.cpp from b2573 to b2608.
|
14
|
+
|
15
|
+
Implementation of bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 has been skipped.
|
16
|
+
|
1
17
|
## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
|
2
18
|
|
3
19
|
- Bump llama.cpp from b2435 to b2496.
|
data/examples/chat.rb
CHANGED
@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
|
|
127
127
|
end
|
128
128
|
|
129
129
|
if input_echo
|
130
|
-
output = []
|
131
|
-
embd.each { |token| output << context.model.token_to_piece(token) }
|
130
|
+
output = embd.map { |token| context.model.token_to_piece(token) }
|
132
131
|
output_str = output.join
|
133
132
|
output_str.chomp!(antiprompt) if first_input
|
134
133
|
print(output_str)
|
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
|
|
136
135
|
|
137
136
|
if embd_input.size <= n_consumed
|
138
137
|
if antiprompt.size.positive?
|
139
|
-
last_output = []
|
140
|
-
last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
|
138
|
+
last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
|
141
139
|
last_output_str = last_output.join
|
142
140
|
|
143
141
|
search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
|
|
37
37
|
abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
|
38
38
|
end
|
39
39
|
FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
|
40
|
+
FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
|
40
41
|
end
|
41
42
|
|
42
43
|
abort('libstdc++ is not found.') unless have_library('stdc++')
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1478,6 +1478,8 @@ public:
|
|
1478
1478
|
rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
|
1479
1479
|
rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
|
1480
1480
|
rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
|
1481
|
+
rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
|
1482
|
+
rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
|
1481
1483
|
rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
|
1482
1484
|
rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
|
1483
1485
|
rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
|
@@ -1743,6 +1745,16 @@ private:
|
|
1743
1745
|
return INT2NUM(llama_token_eos(ptr->model));
|
1744
1746
|
}
|
1745
1747
|
|
1748
|
+
static VALUE _llama_model_token_cls(VALUE self) {
|
1749
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1750
|
+
return INT2NUM(llama_token_cls(ptr->model));
|
1751
|
+
}
|
1752
|
+
|
1753
|
+
static VALUE _llama_model_token_sep(VALUE self) {
|
1754
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1755
|
+
return INT2NUM(llama_token_sep(ptr->model));
|
1756
|
+
}
|
1757
|
+
|
1746
1758
|
static VALUE _llama_model_token_nl(VALUE self) {
|
1747
1759
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1748
1760
|
return INT2NUM(llama_token_nl(ptr->model));
|
@@ -3371,6 +3383,10 @@ extern "C" void Init_llama_cpp(void) {
|
|
3371
3383
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
|
3372
3384
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
|
3373
3385
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
|
3386
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
|
3387
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
|
3388
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
|
3389
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
|
3374
3390
|
|
3375
3391
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
|
3376
3392
|
|
@@ -3410,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
|
|
3410
3426
|
ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
|
3411
3427
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
|
3412
3428
|
|
3429
|
+
ss_magic.str("");
|
3430
|
+
ss_magic.clear(std::stringstream::goodbit);
|
3431
|
+
ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
|
3432
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
|
3433
|
+
|
3413
3434
|
ss_magic.str("");
|
3414
3435
|
ss_magic.clear(std::stringstream::goodbit);
|
3415
3436
|
ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
|
3416
3437
|
rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
|
3417
3438
|
|
3439
|
+
ss_magic.str("");
|
3440
|
+
ss_magic.clear(std::stringstream::goodbit);
|
3441
|
+
ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
|
3442
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
|
3443
|
+
|
3418
3444
|
ss_magic.str("");
|
3419
3445
|
ss_magic.clear(std::stringstream::goodbit);
|
3420
3446
|
ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
|
3421
3447
|
rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
|
3422
3448
|
|
3423
3449
|
rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
|
3450
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
|
3424
3451
|
}
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.14.3'
|
6
|
+
VERSION = '0.14.5'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2496'
|
9
|
+
LLAMA_CPP_VERSION = 'b2658'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,14 @@ module LLaMACpp
|
|
3
3
|
LLAMA_CPP_VERSION: String
|
4
4
|
LLAMA_DEFALUT_SEED: String
|
5
5
|
|
6
|
+
LLAMA_FILE_MAGIC_GGLA: String
|
7
|
+
LLAMA_FILE_MAGIC_GGSN: String
|
8
|
+
LLAMA_FILE_MAGIC_GGSQ: String
|
9
|
+
LLAMA_SESSION_MAGIC: String
|
10
|
+
LLAMA_SESSION_VERSION: String
|
11
|
+
LLAMA_STATE_SEQ_MAGIC: String
|
12
|
+
LLAMA_STATE_SEQ_VERSION: String
|
13
|
+
|
6
14
|
LLAMA_VOCAB_TYPE_NONE: Integer
|
7
15
|
LLAMA_VOCAB_TYPE_SPM: Integer
|
8
16
|
LLAMA_VOCAB_TYPE_BPE: Integer
|
@@ -32,6 +40,10 @@ module LLaMACpp
|
|
32
40
|
LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
|
33
41
|
LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
|
34
42
|
LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
|
43
|
+
LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
|
44
|
+
LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
|
45
|
+
LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
|
46
|
+
LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
|
35
47
|
|
36
48
|
LLAMA_KV_OVERRIDE_TYPE_INT: Integer
|
37
49
|
LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
|
@@ -120,6 +132,8 @@ module LLaMACpp
|
|
120
132
|
def type: (Integer) -> Integer
|
121
133
|
def token_bos: () -> Integer
|
122
134
|
def token_eos: () -> Integer
|
135
|
+
def token_cls: () -> Integer
|
136
|
+
def token_sep: () -> Integer
|
123
137
|
def token_nl: () -> Integer
|
124
138
|
def add_bos_token?: () -> bool
|
125
139
|
def add_eos_token?: () -> bool
|
@@ -1,8 +1,8 @@
|
|
1
1
|
# Define the default target now so that it is always the first target
|
2
2
|
BUILD_TARGETS = \
|
3
3
|
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
4
|
-
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
5
|
-
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
4
|
+
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
|
5
|
+
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
6
6
|
|
7
7
|
# Binaries only useful for tests
|
8
8
|
TEST_TARGETS = \
|
@@ -10,7 +10,7 @@ TEST_TARGETS = \
|
|
10
10
|
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
11
11
|
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
|
12
12
|
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
|
13
|
-
tests/test-json-schema-to-grammar
|
13
|
+
tests/test-json-schema-to-grammar tests/test-grammar-integration
|
14
14
|
|
15
15
|
# Code coverage output files
|
16
16
|
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
@@ -392,14 +392,20 @@ ifdef LLAMA_BLIS
|
|
392
392
|
endif # LLAMA_BLIS
|
393
393
|
|
394
394
|
ifdef LLAMA_CUBLAS
|
395
|
+
# LLAMA_CUBLAS is deprecated and will be removed in the future
|
396
|
+
LLAMA_CUDA := 1
|
397
|
+
endif
|
398
|
+
|
399
|
+
ifdef LLAMA_CUDA
|
395
400
|
ifneq ('', '$(wildcard /opt/cuda)')
|
396
401
|
CUDA_PATH ?= /opt/cuda
|
397
402
|
else
|
398
403
|
CUDA_PATH ?= /usr/local/cuda
|
399
404
|
endif
|
400
|
-
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
405
|
+
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
401
406
|
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
|
402
407
|
OBJS += ggml-cuda.o
|
408
|
+
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
403
409
|
MK_NVCCFLAGS += -use_fast_math
|
404
410
|
ifdef LLAMA_FATAL_WARNINGS
|
405
411
|
MK_NVCCFLAGS += -Werror all-warnings
|
@@ -454,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
|
454
460
|
else
|
455
461
|
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
|
456
462
|
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
457
|
-
|
458
|
-
|
459
|
-
|
463
|
+
ifdef LLAMA_CUDA_NO_PEER_COPY
|
464
|
+
MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
465
|
+
endif # LLAMA_CUDA_NO_PEER_COPY
|
460
466
|
ifdef LLAMA_CUDA_CCBIN
|
461
467
|
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
462
468
|
endif
|
463
|
-
|
469
|
+
|
464
470
|
ifdef JETSON_EOL_MODULE_DETECT
|
465
|
-
|
471
|
+
define NVCC_COMPILE
|
472
|
+
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
473
|
+
endef # NVCC_COMPILE
|
466
474
|
else
|
475
|
+
define NVCC_COMPILE
|
467
476
|
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
477
|
+
endef # NVCC_COMPILE
|
468
478
|
endif # JETSON_EOL_MODULE_DETECT
|
469
|
-
|
479
|
+
|
480
|
+
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
|
481
|
+
$(NVCC_COMPILE)
|
482
|
+
|
483
|
+
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
484
|
+
$(NVCC_COMPILE)
|
485
|
+
|
486
|
+
endif # LLAMA_CUDA
|
470
487
|
|
471
488
|
ifdef LLAMA_CLBLAST
|
472
489
|
|
@@ -512,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
|
|
512
529
|
endif # LLAMA_VULKAN
|
513
530
|
|
514
531
|
ifdef LLAMA_HIPBLAS
|
515
|
-
|
516
532
|
ifeq ($(wildcard /opt/rocm),)
|
517
533
|
ROCM_PATH ?= /usr
|
518
534
|
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
|
@@ -524,7 +540,7 @@ ifdef LLAMA_HIPBLAS
|
|
524
540
|
LLAMA_CUDA_DMMV_X ?= 32
|
525
541
|
LLAMA_CUDA_MMV_Y ?= 1
|
526
542
|
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
527
|
-
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
543
|
+
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
|
528
544
|
ifdef LLAMA_HIP_UMA
|
529
545
|
MK_CPPFLAGS += -DGGML_HIP_UMA
|
530
546
|
endif # LLAMA_HIP_UMA
|
@@ -537,9 +553,18 @@ endif # LLAMA_HIP_UMA
|
|
537
553
|
ifdef LLAMA_CUDA_FORCE_DMMV
|
538
554
|
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
539
555
|
endif # LLAMA_CUDA_FORCE_DMMV
|
556
|
+
ifdef LLAMA_CUDA_NO_PEER_COPY
|
557
|
+
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
558
|
+
endif # LLAMA_CUDA_NO_PEER_COPY
|
540
559
|
OBJS += ggml-cuda.o
|
541
|
-
|
560
|
+
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
561
|
+
|
562
|
+
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
542
563
|
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
564
|
+
|
565
|
+
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
|
566
|
+
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
567
|
+
|
543
568
|
endif # LLAMA_HIPBLAS
|
544
569
|
|
545
570
|
ifdef LLAMA_METAL
|
@@ -592,7 +617,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
|
|
592
617
|
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
593
618
|
|
594
619
|
# identify CUDA host compiler
|
595
|
-
ifdef LLAMA_CUBLAS
|
620
|
+
ifdef LLAMA_CUDA
|
596
621
|
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
|
597
622
|
include scripts/get-flags.mk
|
598
623
|
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
@@ -617,19 +642,26 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
|
|
617
642
|
$(info I LDFLAGS: $(LDFLAGS))
|
618
643
|
$(info I CC: $(shell $(CC) --version | head -n 1))
|
619
644
|
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
620
|
-
ifdef LLAMA_CUBLAS
|
645
|
+
ifdef LLAMA_CUDA
|
621
646
|
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
622
647
|
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
623
648
|
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
624
649
|
ifndef CUDA_DOCKER_ARCH
|
625
650
|
ifndef CUDA_POWER_ARCH
|
626
|
-
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
|
651
|
+
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
|
627
652
|
endif # CUDA_POWER_ARCH
|
628
653
|
endif # CUDA_DOCKER_ARCH
|
629
654
|
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
630
|
-
endif # LLAMA_CUBLAS
|
655
|
+
endif # LLAMA_CUDA
|
631
656
|
$(info )
|
632
657
|
|
658
|
+
ifdef LLAMA_CUBLAS
|
659
|
+
$(info !!!!)
|
660
|
+
$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
|
661
|
+
$(info !!!!)
|
662
|
+
$(info )
|
663
|
+
endif
|
664
|
+
|
633
665
|
#
|
634
666
|
# Build library
|
635
667
|
#
|
@@ -649,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
|
|
649
681
|
unicode.o: unicode.cpp unicode.h
|
650
682
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
651
683
|
|
652
|
-
|
684
|
+
unicode-data.o: unicode-data.cpp unicode-data.h
|
685
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
686
|
+
|
687
|
+
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
653
688
|
|
654
689
|
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
655
690
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
@@ -675,6 +710,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
|
|
675
710
|
train.o: common/train.cpp common/train.h
|
676
711
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
677
712
|
|
713
|
+
ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
|
714
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
715
|
+
|
678
716
|
libllama.so: llama.o ggml.o $(OBJS)
|
679
717
|
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
680
718
|
|
@@ -686,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
|
|
686
724
|
ar rcs libllama.a $^
|
687
725
|
|
688
726
|
clean:
|
689
|
-
rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
727
|
+
rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
728
|
+
rm -vrf ggml-cuda/*.o
|
690
729
|
|
691
730
|
#
|
692
731
|
# Examples
|
@@ -766,6 +805,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
|
|
766
805
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
767
806
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
768
807
|
|
808
|
+
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
809
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
810
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
811
|
+
|
769
812
|
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
770
813
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
771
814
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -803,6 +846,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
|
|
803
846
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
804
847
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
805
848
|
|
849
|
+
retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
850
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
851
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
852
|
+
|
806
853
|
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
807
854
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
808
855
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -815,14 +862,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
|
|
815
862
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
816
863
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
817
864
|
|
818
|
-
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
865
|
+
lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
819
866
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
820
867
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
868
|
+
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
|
869
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
|
870
|
+
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
|
871
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
|
872
|
+
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
|
873
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
|
821
874
|
|
822
875
|
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
823
876
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
824
877
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
825
878
|
|
879
|
+
gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
880
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
881
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
882
|
+
|
826
883
|
ifeq ($(UNAME_S),Darwin)
|
827
884
|
swift: examples/batched.swift
|
828
885
|
(cd examples/batched.swift; make build)
|
@@ -870,6 +927,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
|
|
870
927
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
871
928
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
872
929
|
|
930
|
+
tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
|
931
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
932
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
933
|
+
|
873
934
|
tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
|
874
935
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
875
936
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
705
705
|
struct ggml_tensor * leaf = graph->leafs[i];
|
706
706
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
707
707
|
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
708
|
-
|
709
|
-
|
708
|
+
if (leaf->view_src || leaf->data) {
|
709
|
+
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
710
|
+
galloc->leaf_allocs[i].leaf.size_max = 0;
|
711
|
+
} else {
|
712
|
+
galloc->leaf_allocs[i].leaf.offset = hn->offset;
|
713
|
+
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
714
|
+
}
|
710
715
|
}
|
711
716
|
|
712
717
|
// reallocate buffers if needed
|
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
|
420
420
|
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
|
421
421
|
|
422
422
|
// add forward decls here to avoid including the backend headers
|
423
|
-
#ifdef GGML_USE_CUBLAS
|
423
|
+
#ifdef GGML_USE_CUDA
|
424
424
|
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
425
425
|
ggml_backend_cuda_reg_devices();
|
426
426
|
#endif
|
@@ -137,7 +137,7 @@ extern "C" {
|
|
137
137
|
/*
|
138
138
|
Example usage:
|
139
139
|
|
140
|
-
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be
|
140
|
+
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
|
141
141
|
// preferably to run on the same backend as the buffer
|
142
142
|
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
143
143
|
|
@@ -377,6 +377,27 @@ typedef struct {
|
|
377
377
|
} block_iq1_s;
|
378
378
|
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
379
379
|
|
380
|
+
// 1.75 bpw
|
381
|
+
typedef struct {
|
382
|
+
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
383
|
+
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
384
|
+
#if QK_K == 64
|
385
|
+
ggml_half d;
|
386
|
+
#endif
|
387
|
+
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
388
|
+
} block_iq1_m;
|
389
|
+
#if QK_K == 64
|
390
|
+
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
|
391
|
+
#else
|
392
|
+
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
393
|
+
#endif
|
394
|
+
|
395
|
+
// Used by IQ1_M quants
|
396
|
+
typedef union {
|
397
|
+
ggml_half f16;
|
398
|
+
uint16_t u16;
|
399
|
+
} iq1m_scale_t;
|
400
|
+
|
380
401
|
// Non-linear quants
|
381
402
|
#define QK4_NL 32
|
382
403
|
typedef struct {
|
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
|
|
426
447
|
|
427
448
|
#define GGML_COMMON_IMPL
|
428
449
|
#elif defined(GGML_COMMON_IMPL_SYCL)
|
450
|
+
|
429
451
|
#include <cstdint>
|
430
452
|
|
431
|
-
#define GGML_TABLE_BEGIN(type, name, size) static
|
432
|
-
#define GGML_TABLE_END() }
|
453
|
+
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
|
454
|
+
#define GGML_TABLE_END() };
|
433
455
|
|
434
456
|
#define GGML_COMMON_IMPL
|
435
457
|
#endif
|
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()
|
|
1050
1072
|
|
1051
1073
|
#define NGRID_IQ1S 2048
|
1052
1074
|
#define IQ1S_DELTA 0.125f
|
1075
|
+
#define IQ1M_DELTA 0.125f
|
1053
1076
|
#if defined(GGML_COMMON_IMPL_C)
|
1054
1077
|
GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
|
1055
1078
|
0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
|