llama_cpp 0.12.6 → 0.12.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
+  data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
+  data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
+## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
+
+- Bump bundled llama.cpp from b2143 to b2249.
+- Add constants for file type: `LLAMA_FTYPE_MOSTLY_IQ1_S` and `LLAMA_FTYPE_MOSTLY_IQ4_NL`.
+- Add constants for pooling type: `LLAMA_POOLING_NONE`, `LLAMA_POOLING_MEAN`, and `LLAMA_POOLING_CLS`.
+- Add `numa_init` module function to `LLaMACpp`.
+- Remove unnecessary argument from `backend_init`.
+
+Implementation of llama_chat_apply_template binding has been postponed for the time being.
+
 ## [[0.12.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.5...v0.12.6)] - 2024-02-17
 
 - Bump bundled llama.cpp from b2106 to b2143.
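The `backend_init` change is breaking for callers that passed the `numa:` keyword: NUMA configuration now goes through the separate `numa_init` module function, which takes an Integer strategy. A minimal usage sketch of the new calling convention; the value 1 is assumed to correspond to `GGML_NUMA_STRATEGY_DISTRIBUTE` in the bundled ggml.h, which is not shown in this diff:

    require 'llama_cpp'

    # backend_init no longer accepts a numa: keyword argument.
    LLaMACpp.backend_init

    # NUMA setup is now an explicit, separate call; the Integer is cast to
    # enum ggml_numa_strategy (1 = distribute, assumed from ggml.h).
    LLaMACpp.numa_init(1)

    # ... load models, run inference ...

    LLaMACpp.backend_free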
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -3243,15 +3243,8 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
-static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
-  VALUE kw_args = Qnil;
-  ID kw_table[1] = { rb_intern("numa") };
-  VALUE kw_values[1] = { Qundef };
-  rb_scan_args(argc, argv, ":", &kw_args);
-  rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
-
-  const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
-  llama_backend_init(numa);
+static VALUE rb_llama_llama_backend_init(VALUE self) {
+  llama_backend_init();
 
   return Qnil;
 }
@@ -3262,6 +3255,17 @@ static VALUE rb_llama_llama_backend_free(VALUE self) {
   return Qnil;
 }
 
+static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
+  if (!RB_INTEGER_TYPE_P(strategy)) {
+    rb_raise(rb_eArgError, "strategy must be an integer");
+    return Qnil;
+  }
+
+  llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
+
+  return Qnil;
+}
+
 static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   VALUE kw_args = Qnil;
   ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
@@ -3345,8 +3349,9 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMAGrammarElement::define_class(rb_mLLaMACpp);
   RbLLaMAGrammar::define_class(rb_mLLaMACpp);
 
-  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
+  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, 0);
   rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
+  rb_define_module_function(rb_mLLaMACpp, "numa_init", rb_llama_llama_numa_init, 1);
   rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
@@ -3391,6 +3396,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
@@ -3412,6 +3419,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
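For reference, a minimal sketch of quantizing a model with one of the newly exposed file-type constants; it assumes `ModelQuantizeParams` exposes an `ftype=` setter mapped to `llama_model_quantize_params.ftype` (not shown in this diff), and the file paths are placeholders:

    require 'llama_cpp'

    params = LLaMACpp::ModelQuantizeParams.new
    # Select the IQ4_NL file type added in this release (the ftype= setter is
    # assumed from the existing binding, not part of this diff).
    params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ4_NL

    LLaMACpp.model_quantize(input_path: 'model-f16.gguf',
                            output_path: 'model-iq4_nl.gguf',
                            params: params)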
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.6'
+  VERSION = '0.12.7'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2143'
+  LLAMA_CPP_VERSION = 'b2249'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -29,6 +29,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -48,12 +50,17 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_YARN: Integer
   LLAMA_ROPE_SCALING_MAX_VALUE: Integer
 
+  LLAMA_POOLING_NONE: Integer
+  LLAMA_POOLING_MEAN: Integer
+  LLAMA_POOLING_CLS: Integer
+
   LLAMA_SPLIT_NONE: Integer
   LLAMA_SPLIT_LAYER: Integer
   LLAMA_SPLIT_ROW: Integer
 
-  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_init: () -> void
   def self?.backend_free: () -> void
+  def self?.numa_init: (Integer) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -97,9 +97,10 @@ endif
 #
 
 # keep standard at C11 and C++11
-MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS   = -std=c11   -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CPPFLAGS  = -I. -Icommon
+MK_CFLAGS    = -std=c11   -fPIC
+MK_CXXFLAGS  = -std=c++11 -fPIC
+MK_NVCCFLAGS = -std=c++11
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
 MK_LDFLAGS += -g
 
 ifeq ($(UNAME_S),Linux)
-MK_CXXFLAGS += -D_GLIBCXX_ASSERTIONS
+MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 endif
 else
 MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
             -Werror=implicit-function-declaration
 MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
 
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
+MK_CFLAGS   += -Werror
+MK_CXXFLAGS += -Werror
+endif
+
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -381,6 +387,9 @@ ifdef LLAMA_CUBLAS
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
+ifdef LLAMA_FATAL_WARNINGS
+	MK_NVCCFLAGS += -Werror all-warnings
+endif # LLAMA_FATAL_WARNINGS
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
@@ -439,9 +448,9 @@ ifdef LLAMA_CUDA_CCBIN
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
-	$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS
 
@@ -526,11 +535,29 @@ ifdef LLAMA_METAL
 ifdef LLAMA_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
+ifdef LLAMA_METAL_EMBED_LIBRARY
+	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+	OBJS += ggml-metal-embed.o
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
+
+ifdef LLAMA_METAL_EMBED_LIBRARY
+ggml-metal-embed.o: ggml-metal.metal
+	@echo "Embedding Metal library"
+	$(eval TEMP_ASSEMBLY=$(shell mktemp))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@$(AS) $(TEMP_ASSEMBLY) -o $@
+	@rm -f ${TEMP_ASSEMBLY}
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
@@ -542,9 +569,10 @@ GF_CC := $(CC)
 include scripts/get-flags.mk
 
 # combine build flags with cmdline overrides
-override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
 
@@ -552,7 +580,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 ifdef LLAMA_CUBLAS
 	GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 	include scripts/get-flags.mk
-	CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+	CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
 #
@@ -633,7 +661,6 @@ lib: llama.o ggml.o $(OBJS)
 
 clean:
	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
-	# find examples pocs -type f -name "*.o" -delete
 
 #
 # Examples
@@ -697,7 +724,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -868,3 +895,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
 tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED
@@ -377,6 +377,9 @@ struct ggml_gallocr {
 
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
+
+    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    int n_leafs;
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     free(galloc->buffers);
     free(galloc->buf_tallocs);
     free(galloc->node_allocs);
+    free(galloc->leaf_allocs);
     free(galloc);
 }
 
@@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * parent = node->src[i];
         if (parent == NULL) {
-            break;
+            continue;
         }
 
         // if the node's data is external, then we cannot re-use it
@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
 
-    // allocate all graph inputs first to avoid overwriting them
-    for (int i = 0; i < graph->n_nodes; i++) {
-        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
-            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (graph->nodes[i]->src[j] == NULL) {
-                break;
-            }
-            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
-                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
-            }
-        }
-    }
-
     // count number of children and views
+    // allocate all graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
 
+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+        }
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                break;
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                continue;
+            }
+
+            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+            // allocate explicit inputs and leafs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
-            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
         }
-
+    }
+
+    // allocate the remaining leafs that are unused on the graph
+    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+        if (hn->n_children == 0) {
+            assert(!hn->allocated);
+            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+            ggml_gallocr_allocate_node(galloc, leaf, 0);
+        }
+    }
 
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             ggml_gallocr_allocate_node(galloc, parent, buffer_id);
         }
@@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             AT_PRINTF("%s", parent->name);
             if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
@@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
             p_hn->n_children -= 1;
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
     }
+    if (galloc->n_leafs < graph->n_leafs) {
+        free(galloc->leaf_allocs);
+        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        GGML_ASSERT(galloc->leaf_allocs != NULL);
+    }
+    galloc->n_leafs = graph->n_leafs;
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+        galloc->leaf_allocs[i].offset = hn->offset;
+        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+    }
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
 
     if (node->view_src != NULL) {
         if (node->buffer == NULL) {
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
             // this tensor was allocated without ggml-backend
             return;
         }
-        ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+        ggml_backend_view_init(galloc->buffers[buffer_id], node);
     } else {
         if (node->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
         } else {
             if (node->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-
-#ifndef NDEBUG
-            size_t offset = (char *)node->data - (char *)ggml_backend_buffer_get_base(node->buffer);
-            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
-            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
-            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
         }
     }
 }
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         return true;
     }
 
+    if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+        return true;
+    }
+
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -787,7 +810,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
 #ifndef NDEBUG
@@ -827,17 +850,24 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     }
 
     // allocate the graph tensors from the previous assignments
+    // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+    }
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
     }
 
     return true;
data/vendor/tmp/llama.cpp/ggml-backend.c CHANGED
@@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
+    if (!size) {
+        return;
+    }
+
     tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
@@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
     GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
+    if (!size) {
+        return;
+    }
+
     tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
@@ -748,7 +756,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -998,6 +1006,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
         }
     }
     GGML_ASSERT(false && "tensor buffer type not supported by any backend");
+    return -1; // silence warning
 }
 
 #if 0
@@ -1032,7 +1041,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
-            break;
+            continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
@@ -1079,7 +1088,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             ggml_backend_t src_backend = tensor_backend(src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
@@ -1135,7 +1144,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             if (tensor_backend_id(src) == -1) {
                 tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
@@ -1247,7 +1256,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             int src_backend_id = tensor_backend_id(src);
             if (src_backend_id == -1) {
@@ -1306,7 +1315,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             int src_backend_id = tensor_backend_id(src);
             assert(src_backend_id != -1); // all inputs should be assigned by now
@@ -1353,7 +1362,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             ggml_backend_t src_backend = tensor_backend(src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
@@ -1659,7 +1668,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * s = src->src[i];
         if (s == NULL) {
-            break;
+            continue;
         }
         dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
     }
@@ -1688,7 +1697,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * s = src->src[i];
         if (s == NULL) {
-            break;
+            continue;
         }
         graph_copy_init_tensor(hash_set, node_copies, node_init, s);
     }