llama_cpp 0.12.6 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
+  data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
+  data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
+## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
+
+- Bump bundled llama.cpp from b2143 to b2249.
+- Add constants for file type: `LLAMA_FTYPE_MOSTLY_IQ1_S` and `LLAMA_FTYPE_MOSTLY_IQ4_NL`.
+- Add constants for pooling type: `LLAMA_POOLING_NONE`, `LLAMA_POOLING_MEAN`, and `LLAMA_POOLING_CLS`.
+- Add `numa_init` module function to `LLaMACpp`.
+- Remove unnecessary argument from `backend_init`.
+
+Implementation of llama_chat_apply_template binding has been postponed for the time being.
+
 ## [[0.12.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.5...v0.12.6)] - 2024-02-17
 
 - Bump bundled llama.cpp from b2106 to b2143.
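Taken together, the 0.12.7 entries above change how a program boots the backend: `backend_init` no longer accepts a `numa:` keyword, and NUMA setup moves to the new `numa_init` module function, which takes an integer strategy value (the C++ binding below casts it to `enum ggml_numa_strategy`). A minimal usage sketch follows; the strategy value `1` and the `at_exit` cleanup are illustrative assumptions, not part of this diff:

```ruby
require 'llama_cpp'

# 0.12.6 and earlier accepted a keyword: LLaMACpp.backend_init(numa: true)
# 0.12.7 takes no arguments:
LLaMACpp.backend_init

# NUMA configuration is now a separate call that takes an integer strategy,
# cast to ggml's ggml_numa_strategy by the binding; 1 is an assumed example value.
LLaMACpp.numa_init(1)

# constants added in this release
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ4_NL
puts LLaMACpp::LLAMA_POOLING_MEAN

at_exit { LLaMACpp.backend_free }
```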
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -3243,15 +3243,8 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
-static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
-  VALUE kw_args = Qnil;
-  ID kw_table[1] = { rb_intern("numa") };
-  VALUE kw_values[1] = { Qundef };
-  rb_scan_args(argc, argv, ":", &kw_args);
-  rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
-
-  const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
-  llama_backend_init(numa);
+static VALUE rb_llama_llama_backend_init(VALUE self) {
+  llama_backend_init();
 
   return Qnil;
 }
@@ -3262,6 +3255,17 @@ static VALUE rb_llama_llama_backend_free(VALUE self) {
   return Qnil;
 }
 
+static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
+  if (!RB_INTEGER_TYPE_P(strategy)) {
+    rb_raise(rb_eArgError, "strategy must be an integer");
+    return Qnil;
+  }
+
+  llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
+
+  return Qnil;
+}
+
 static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   VALUE kw_args = Qnil;
   ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
@@ -3345,8 +3349,9 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMAGrammarElement::define_class(rb_mLLaMACpp);
   RbLLaMAGrammar::define_class(rb_mLLaMACpp);
 
-  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
+  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, 0);
   rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
+  rb_define_module_function(rb_mLLaMACpp, "numa_init", rb_llama_llama_numa_init, 1);
   rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
@@ -3391,6 +3396,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
@@ -3412,6 +3419,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.6'
+  VERSION = '0.12.7'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2143'
+  LLAMA_CPP_VERSION = 'b2249'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -29,6 +29,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -48,12 +50,17 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_YARN: Integer
   LLAMA_ROPE_SCALING_MAX_VALUE: Integer
 
+  LLAMA_POOLING_NONE: Integer
+  LLAMA_POOLING_MEAN: Integer
+  LLAMA_POOLING_CLS: Integer
+
   LLAMA_SPLIT_NONE: Integer
   LLAMA_SPLIT_LAYER: Integer
   LLAMA_SPLIT_ROW: Integer
 
-  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_init: () -> void
   def self?.backend_free: () -> void
+  def self?.numa_init: (Integer) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -97,9 +97,10 @@ endif
 #
 
 # keep standard at C11 and C++11
-MK_CPPFLAGS
-MK_CFLAGS
-MK_CXXFLAGS
+MK_CPPFLAGS = -I. -Icommon
+MK_CFLAGS = -std=c11 -fPIC
+MK_CXXFLAGS = -std=c++11 -fPIC
+MK_NVCCFLAGS = -std=c++11
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
 	MK_LDFLAGS += -g
 
 	ifeq ($(UNAME_S),Linux)
-
+		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 	endif
 else
 	MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
             -Werror=implicit-function-declaration
 MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
 
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
+	MK_CFLAGS += -Werror
+	MK_CXXFLAGS += -Werror
+endif
+
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -381,6 +387,9 @@ ifdef LLAMA_CUBLAS
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
+ifdef LLAMA_FATAL_WARNINGS
+	MK_NVCCFLAGS += -Werror all-warnings
+endif # LLAMA_FATAL_WARNINGS
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
@@ -439,9 +448,9 @@ ifdef LLAMA_CUDA_CCBIN
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
-	$(NVCC) $(
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS
 
@@ -526,11 +535,29 @@ ifdef LLAMA_METAL
 ifdef LLAMA_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
+ifdef LLAMA_METAL_EMBED_LIBRARY
+	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+	OBJS += ggml-metal-embed.o
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
+
+ifdef LLAMA_METAL_EMBED_LIBRARY
+ggml-metal-embed.o: ggml-metal.metal
+	@echo "Embedding Metal library"
+	$(eval TEMP_ASSEMBLY=$(shell mktemp))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@$(AS) $(TEMP_ASSEMBLY) -o $@
+	@rm -f ${TEMP_ASSEMBLY}
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
@@ -542,9 +569,10 @@ GF_CC := $(CC)
 include scripts/get-flags.mk
 
 # combine build flags with cmdline overrides
-override
-
-
+override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 
@@ -552,7 +580,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
 #
@@ -633,7 +661,6 @@ lib: llama.o ggml.o $(OBJS)
 
 clean:
 	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
-# find examples pocs -type f -name "*.o" -delete
 
 #
 # Examples
@@ -697,7 +724,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -868,3 +895,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
 tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -377,6 +377,9 @@ struct ggml_gallocr {
 
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
+
+    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    int n_leafs;
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     free(galloc->buffers);
     free(galloc->buf_tallocs);
     free(galloc->node_allocs);
+    free(galloc->leaf_allocs);
     free(galloc);
 }
 
@@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * parent = node->src[i];
         if (parent == NULL) {
-            break;
+            continue;
         }
 
         // if the node's data is external, then we cannot re-use it
@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
 
-    // allocate all graph inputs first to avoid overwriting them
-    for (int i = 0; i < graph->n_nodes; i++) {
-        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
-            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (graph->nodes[i]->src[j] == NULL) {
-                break;
-            }
-            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
-                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
-            }
-        }
-    }
-
     // count number of children and views
+    // allocate all graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
 
+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+        }
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                break;
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                continue;
+            }
+
+            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+            // allocate explicit inputs and leafs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
-            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
         }
-    }
+    }
+
+    // allocate the remaining leafs that are unused on the graph
+    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+        if (hn->n_children == 0) {
+            assert(!hn->allocated);
+            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+            ggml_gallocr_allocate_node(galloc, leaf, 0);
+        }
+    }
 
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
-                break;
+                continue;
            }
            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
        }
@@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
-                break;
+                continue;
            }
            AT_PRINTF("%s", parent->name);
            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
@@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
-                break;
+                continue;
            }
            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
            p_hn->n_children -= 1;
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
            }
        }
    }
+    if (galloc->n_leafs < graph->n_leafs) {
+        free(galloc->leaf_allocs);
+        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        GGML_ASSERT(galloc->leaf_allocs != NULL);
+    }
+    galloc->n_leafs = graph->n_leafs;
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+        galloc->leaf_allocs[i].offset = hn->offset;
+        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+    }
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
 
     if (node->view_src != NULL) {
         if (node->buffer == NULL) {
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], node);
         }
     } else {
         if (node->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
         } else {
             if (node->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-
-#ifndef NDEBUG
-            size_t offset =
-                (char *)node->data -
-                (char *)ggml_backend_buffer_get_base(node->buffer);
-            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
-            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
-            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
         }
     }
 }
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         return true;
     }
 
+    if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+        return true;
+    }
+
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -787,7 +810,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
 #ifndef NDEBUG
@@ -827,17 +850,24 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     }
 
     // allocate the graph tensors from the previous assignments
+    // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+    }
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
     }
 
     return true;
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
+    if (!size) {
+        return;
+    }
+
     tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
@@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
     GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
+    if (!size) {
+        return;
+    }
+
     tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
@@ -748,7 +756,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -998,6 +1006,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
         }
     }
     GGML_ASSERT(false && "tensor buffer type not supported by any backend");
+    return -1; // silence warning
 }
 
 #if 0
@@ -1032,7 +1041,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
-            break;
+            continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
@@ -1079,7 +1088,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             ggml_backend_t src_backend = tensor_backend(src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
@@ -1135,7 +1144,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             if (tensor_backend_id(src) == -1) {
                 tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
@@ -1247,7 +1256,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             int src_backend_id = tensor_backend_id(src);
             if (src_backend_id == -1) {
@@ -1306,7 +1315,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             int src_backend_id = tensor_backend_id(src);
             assert(src_backend_id != -1); // all inputs should be assigned by now
@@ -1353,7 +1362,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             ggml_backend_t src_backend = tensor_backend(src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
@@ -1659,7 +1668,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * s = src->src[i];
         if (s == NULL) {
-            break;
+            continue;
         }
         dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
     }
@@ -1688,7 +1697,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * s = src->src[i];
         if (s == NULL) {
-            break;
+            continue;
         }
         graph_copy_init_tensor(hash_set, node_copies, node_init, s);
     }