llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +27 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +14 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +81 -20
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
- data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +141 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -12
- data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
- data/vendor/tmp/llama.cpp/llama.h +145 -29
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
+  data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
+  data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,19 @@
+## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
+
+- Bump llama.cpp from b2608 to b2658.
+- Add magic number constants.
+- Add `token_cls` and `token_sep` methods to `Model`.
+
+Implementation bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file have been skipped.
+
+## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
+
+- Bump llama.cpp from b2496 to b2573.
+- Add file type constants.
+- Bump llama.cpp from b2573 to b2608.
+
+Implementation bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 have been skipped.
+
 ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
 
 - Bump llama.cpp from b2435 to b2496.
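As a quick illustration of the new `Model` methods listed above, here is a minimal Ruby sketch. The GGUF path is a placeholder, construction follows the gem's README pattern, and the `-1` fallback reflects llama.cpp's `llama_token_cls`/`llama_token_sep` behavior; none of this is taken verbatim from the diff.

```ruby
require 'llama_cpp'

# Placeholder path; any GGUF model works here.
model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)

# New in 0.14.5: classification and separator token ids, wrapping
# llama_token_cls/llama_token_sep (-1 when the vocabulary defines none).
puts model.token_cls
puts model.token_sep
```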
data/examples/chat.rb
CHANGED
@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
     end
 
     if input_echo
-      output = []
-      embd.each { |token| output << context.model.token_to_piece(token) }
+      output = embd.map { |token| context.model.token_to_piece(token) }
       output_str = output.join
       output_str.chomp!(antiprompt) if first_input
       print(output_str)
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
     if embd_input.size <= n_consumed
       if antiprompt.size.positive?
-        last_output = []
-        last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
+        last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
         last_output_str = last_output.join
 
         search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
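The change above swaps build-by-mutation for `Array#map`. A standalone sketch of the equivalence, with `to_s` standing in for `token_to_piece`:

```ruby
tokens = [15, 339, 1101]

# Before: accumulate into an explicit temporary array.
output = []
tokens.each { |token| output << token.to_s }

# After: one expression, no mutable temporary.
output = tokens.map { |token| token.to_s }
```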
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
     abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
   end
   FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
+  FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
 end
 
 abort('libstdc++ is not found.') unless have_library('stdc++')
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1478,6 +1478,8 @@ public:
     rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
     rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
     rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
+    rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
+    rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
     rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
     rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
     rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
@@ -1743,6 +1745,16 @@ private:
     return INT2NUM(llama_token_eos(ptr->model));
   }
 
+  static VALUE _llama_model_token_cls(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_token_cls(ptr->model));
+  }
+
+  static VALUE _llama_model_token_sep(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_token_sep(ptr->model));
+  }
+
   static VALUE _llama_model_token_nl(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_token_nl(ptr->model));
@@ -3371,6 +3383,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
@@ -3410,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
+
   ss_magic.str("");
   ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
+
   ss_magic.str("");
   ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
   rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
 }
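The magic-number constants reach Ruby as hex strings because the extension formats each value through a `std::stringstream` with `std::showbase << std::hex`, while the version constants go through `std::to_string`. A sketch of what user code sees; the `0x67677371` value is llama.h's `LLAMA_FILE_MAGIC_GGSQ` and is shown for illustration:

```ruby
require 'llama_cpp'

puts LLaMACpp::LLAMA_FILE_MAGIC_GGSQ   # e.g. "0x67677371" ('ggsq')
puts LLaMACpp::LLAMA_STATE_SEQ_MAGIC   # formatted the same way as LLAMA_SESSION_MAGIC
puts LLaMACpp::LLAMA_STATE_SEQ_VERSION # a decimal string, via std::to_string
```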
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.3'
+  VERSION = '0.14.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2496'
+  LLAMA_CPP_VERSION = 'b2658'
 end
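After upgrading, these two constants give a quick sanity check that the expected gem and bundled llama.cpp build are loaded:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.14.5"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2658"
```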
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,14 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
+  LLAMA_FILE_MAGIC_GGLA: String
+  LLAMA_FILE_MAGIC_GGSN: String
+  LLAMA_FILE_MAGIC_GGSQ: String
+  LLAMA_SESSION_MAGIC: String
+  LLAMA_SESSION_VERSION: String
+  LLAMA_STATE_SEQ_MAGIC: String
+  LLAMA_STATE_SEQ_VERSION: String
+
   LLAMA_VOCAB_TYPE_NONE: Integer
   LLAMA_VOCAB_TYPE_SPM: Integer
   LLAMA_VOCAB_TYPE_BPE: Integer
@@ -32,6 +40,10 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
   LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
 
   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -120,6 +132,8 @@ module LLaMACpp
     def type: (Integer) -> Integer
     def token_bos: () -> Integer
     def token_eos: () -> Integer
+    def token_cls: () -> Integer
+    def token_sep: () -> Integer
     def token_nl: () -> Integer
     def add_bos_token?: () -> bool
     def add_eos_token?: () -> bool
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -10,7 +10,7 @@ TEST_TARGETS = \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
 	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-	tests/test-json-schema-to-grammar
+	tests/test-json-schema-to-grammar tests/test-grammar-integration
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -392,14 +392,20 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS
 
 ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future
+	LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -454,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-
-
-
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-
+
 ifdef JETSON_EOL_MODULE_DETECT
-
+define NVCC_COMPILE
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 else
+define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
-
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(NVCC_COMPILE)
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	$(NVCC_COMPILE)
+
+endif # LLAMA_CUDA
 
 ifdef LLAMA_CLBLAST
 
@@ -512,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 endif # LLAMA_VULKAN
 
 ifdef LLAMA_HIPBLAS
-
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH ?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -524,7 +540,7 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X ?= 32
 	LLAMA_CUDA_MMV_Y ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@@ -537,9 +553,18 @@ endif # LLAMA_HIP_UMA
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS += ggml-cuda.o
-
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
 endif # LLAMA_HIPBLAS
 
 ifdef LLAMA_METAL
@@ -592,7 +617,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 
 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -617,19 +642,26 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
 $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 $(info )
 
+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
 #
 # Build library
 #
@@ -649,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+unicode-data.o: unicode-data.cpp unicode-data.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -675,6 +710,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
@@ -686,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf ggml-cuda/*.o
 
 #
 # Examples
@@ -766,6 +805,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -803,6 +846,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -815,14 +862,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
 
 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
@@ -870,6 +927,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
-        galloc->leaf_allocs[i].leaf.offset = hn->offset;
-        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.size_max = 0;
+        } else {
+            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        }
     }
 
     // reallocate buffers if needed
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
 
     // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
     extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -137,7 +137,7 @@ extern "C" {
     /*
       Example usage:
 
-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
         // preferrably to run on the same backend as the buffer
         ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
data/vendor/tmp/llama.cpp/ggml-common.h
CHANGED
@@ -377,6 +377,27 @@ typedef struct {
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 
+// 1.75 bpw
+typedef struct {
+    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
+    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
+#if QK_K == 64
+    ggml_half d;
+#endif
+    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+#if QK_K == 64
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+#else
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+#endif
+
+// Used by IQ1_M quants
+typedef union {
+    ggml_half f16;
+    uint16_t  u16;
+} iq1m_scale_t;
+
 // Non-linear quants
 #define QK4_NL 32
 typedef struct {
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 
 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_SYCL)
+
 #include <cstdint>
 
-#define GGML_TABLE_BEGIN(type, name, size) static
-#define GGML_TABLE_END() }
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
 
 #define GGML_COMMON_IMPL
 #endif
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()
 
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
 #if defined(GGML_COMMON_IMPL_C)
 GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
 0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,